Total coverage: 73902 (5%)of 1820798
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 
916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 /* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2000-2001 Qualcomm Incorporated Copyright (C) 2009-2010 Gustavo F. Padovan <gustavo@padovan.org> Copyright (C) 2010 Google Inc. Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ #ifndef __L2CAP_H #define __L2CAP_H #include <linux/unaligned.h> #include <linux/atomic.h> /* L2CAP defaults */ #define L2CAP_DEFAULT_MTU 672 #define L2CAP_DEFAULT_MIN_MTU 48 #define L2CAP_DEFAULT_FLUSH_TO 0xFFFF #define L2CAP_EFS_DEFAULT_FLUSH_TO 0xFFFFFFFF #define L2CAP_DEFAULT_TX_WINDOW 63 #define L2CAP_DEFAULT_EXT_WINDOW 0x3FFF #define L2CAP_DEFAULT_MAX_TX 3 #define L2CAP_DEFAULT_RETRANS_TO 2000 /* 2 seconds */ #define L2CAP_DEFAULT_MONITOR_TO 12000 /* 12 seconds */ #define L2CAP_DEFAULT_MAX_PDU_SIZE 1492 /* Sized for AMP packet */ #define L2CAP_DEFAULT_ACK_TO 200 #define L2CAP_DEFAULT_MAX_SDU_SIZE 0xFFFF #define L2CAP_DEFAULT_SDU_ITIME 0xFFFFFFFF #define L2CAP_DEFAULT_ACC_LAT 0xFFFFFFFF #define L2CAP_BREDR_MAX_PAYLOAD 1019 /* 3-DH5 packet */ #define L2CAP_LE_MIN_MTU 23 #define L2CAP_ECRED_CONN_SCID_MAX 5 #define L2CAP_DISC_TIMEOUT msecs_to_jiffies(100) #define L2CAP_DISC_REJ_TIMEOUT msecs_to_jiffies(5000) #define L2CAP_ENC_TIMEOUT msecs_to_jiffies(5000) #define L2CAP_CONN_TIMEOUT msecs_to_jiffies(40000) #define L2CAP_INFO_TIMEOUT msecs_to_jiffies(4000) #define L2CAP_MOVE_TIMEOUT msecs_to_jiffies(4000) #define L2CAP_MOVE_ERTX_TIMEOUT msecs_to_jiffies(60000) #define L2CAP_WAIT_ACK_POLL_PERIOD msecs_to_jiffies(200) #define L2CAP_WAIT_ACK_TIMEOUT msecs_to_jiffies(10000) /* L2CAP socket address */ struct sockaddr_l2 { sa_family_t l2_family; __le16 l2_psm; bdaddr_t l2_bdaddr; __le16 l2_cid; __u8 l2_bdaddr_type; }; /* L2CAP socket options */ #define L2CAP_OPTIONS 0x01 struct l2cap_options { __u16 omtu; __u16 imtu; __u16 flush_to; __u8 mode; __u8 fcs; __u8 max_tx; __u16 txwin_size; }; #define L2CAP_CONNINFO 0x02 struct l2cap_conninfo { __u16 hci_handle; __u8 dev_class[3]; }; #define L2CAP_LM 0x03 #define L2CAP_LM_MASTER 0x0001 #define L2CAP_LM_AUTH 0x0002 #define L2CAP_LM_ENCRYPT 0x0004 #define L2CAP_LM_TRUSTED 0x0008 #define L2CAP_LM_RELIABLE 0x0010 #define L2CAP_LM_SECURE 0x0020 #define L2CAP_LM_FIPS 0x0040 /* L2CAP command codes */ #define L2CAP_COMMAND_REJ 0x01 #define L2CAP_CONN_REQ 0x02 #define L2CAP_CONN_RSP 0x03 #define L2CAP_CONF_REQ 0x04 #define L2CAP_CONF_RSP 0x05 #define L2CAP_DISCONN_REQ 0x06 
#define L2CAP_DISCONN_RSP 0x07 #define L2CAP_ECHO_REQ 0x08 #define L2CAP_ECHO_RSP 0x09 #define L2CAP_INFO_REQ 0x0a #define L2CAP_INFO_RSP 0x0b #define L2CAP_CONN_PARAM_UPDATE_REQ 0x12 #define L2CAP_CONN_PARAM_UPDATE_RSP 0x13 #define L2CAP_LE_CONN_REQ 0x14 #define L2CAP_LE_CONN_RSP 0x15 #define L2CAP_LE_CREDITS 0x16 #define L2CAP_ECRED_CONN_REQ 0x17 #define L2CAP_ECRED_CONN_RSP 0x18 #define L2CAP_ECRED_RECONF_REQ 0x19 #define L2CAP_ECRED_RECONF_RSP 0x1a /* L2CAP extended feature mask */ #define L2CAP_FEAT_FLOWCTL 0x00000001 #define L2CAP_FEAT_RETRANS 0x00000002 #define L2CAP_FEAT_BIDIR_QOS 0x00000004 #define L2CAP_FEAT_ERTM 0x00000008 #define L2CAP_FEAT_STREAMING 0x00000010 #define L2CAP_FEAT_FCS 0x00000020 #define L2CAP_FEAT_EXT_FLOW 0x00000040 #define L2CAP_FEAT_FIXED_CHAN 0x00000080 #define L2CAP_FEAT_EXT_WINDOW 0x00000100 #define L2CAP_FEAT_UCD 0x00000200 /* L2CAP checksum option */ #define L2CAP_FCS_NONE 0x00 #define L2CAP_FCS_CRC16 0x01 /* L2CAP fixed channels */ #define L2CAP_FC_SIG_BREDR 0x02 #define L2CAP_FC_CONNLESS 0x04 #define L2CAP_FC_ATT 0x10 #define L2CAP_FC_SIG_LE 0x20 #define L2CAP_FC_SMP_LE 0x40 #define L2CAP_FC_SMP_BREDR 0x80 /* L2CAP Control Field bit masks */ #define L2CAP_CTRL_SAR 0xC000 #define L2CAP_CTRL_REQSEQ 0x3F00 #define L2CAP_CTRL_TXSEQ 0x007E #define L2CAP_CTRL_SUPERVISE 0x000C #define L2CAP_CTRL_RETRANS 0x0080 #define L2CAP_CTRL_FINAL 0x0080 #define L2CAP_CTRL_POLL 0x0010 #define L2CAP_CTRL_FRAME_TYPE 0x0001 /* I- or S-Frame */ #define L2CAP_CTRL_TXSEQ_SHIFT 1 #define L2CAP_CTRL_SUPER_SHIFT 2 #define L2CAP_CTRL_POLL_SHIFT 4 #define L2CAP_CTRL_FINAL_SHIFT 7 #define L2CAP_CTRL_REQSEQ_SHIFT 8 #define L2CAP_CTRL_SAR_SHIFT 14 /* L2CAP Extended Control Field bit mask */ #define L2CAP_EXT_CTRL_TXSEQ 0xFFFC0000 #define L2CAP_EXT_CTRL_SAR 0x00030000 #define L2CAP_EXT_CTRL_SUPERVISE 0x00030000 #define L2CAP_EXT_CTRL_REQSEQ 0x0000FFFC #define L2CAP_EXT_CTRL_POLL 0x00040000 #define L2CAP_EXT_CTRL_FINAL 0x00000002 #define L2CAP_EXT_CTRL_FRAME_TYPE 0x00000001 /* I- or S-Frame */ #define L2CAP_EXT_CTRL_FINAL_SHIFT 1 #define L2CAP_EXT_CTRL_REQSEQ_SHIFT 2 #define L2CAP_EXT_CTRL_SAR_SHIFT 16 #define L2CAP_EXT_CTRL_SUPER_SHIFT 16 #define L2CAP_EXT_CTRL_POLL_SHIFT 18 #define L2CAP_EXT_CTRL_TXSEQ_SHIFT 18 /* L2CAP Supervisory Function */ #define L2CAP_SUPER_RR 0x00 #define L2CAP_SUPER_REJ 0x01 #define L2CAP_SUPER_RNR 0x02 #define L2CAP_SUPER_SREJ 0x03 /* L2CAP Segmentation and Reassembly */ #define L2CAP_SAR_UNSEGMENTED 0x00 #define L2CAP_SAR_START 0x01 #define L2CAP_SAR_END 0x02 #define L2CAP_SAR_CONTINUE 0x03 /* L2CAP Command rej. 
reasons */ #define L2CAP_REJ_NOT_UNDERSTOOD 0x0000 #define L2CAP_REJ_MTU_EXCEEDED 0x0001 #define L2CAP_REJ_INVALID_CID 0x0002 /* L2CAP structures */ struct l2cap_hdr { __le16 len; __le16 cid; } __packed; #define L2CAP_LEN_SIZE 2 #define L2CAP_HDR_SIZE 4 #define L2CAP_ENH_HDR_SIZE 6 #define L2CAP_EXT_HDR_SIZE 8 #define L2CAP_FCS_SIZE 2 #define L2CAP_SDULEN_SIZE 2 #define L2CAP_PSMLEN_SIZE 2 #define L2CAP_ENH_CTRL_SIZE 2 #define L2CAP_EXT_CTRL_SIZE 4 struct l2cap_cmd_hdr { __u8 code; __u8 ident; __le16 len; } __packed; #define L2CAP_CMD_HDR_SIZE 4 struct l2cap_cmd_rej_unk { __le16 reason; } __packed; struct l2cap_cmd_rej_mtu { __le16 reason; __le16 max_mtu; } __packed; struct l2cap_cmd_rej_cid { __le16 reason; __le16 scid; __le16 dcid; } __packed; struct l2cap_conn_req { __le16 psm; __le16 scid; } __packed; struct l2cap_conn_rsp { __le16 dcid; __le16 scid; __le16 result; __le16 status; } __packed; /* protocol/service multiplexer (PSM) */ #define L2CAP_PSM_SDP 0x0001 #define L2CAP_PSM_RFCOMM 0x0003 #define L2CAP_PSM_3DSP 0x0021 #define L2CAP_PSM_IPSP 0x0023 /* 6LoWPAN */ #define L2CAP_PSM_DYN_START 0x1001 #define L2CAP_PSM_DYN_END 0xffff #define L2CAP_PSM_AUTO_END 0x10ff #define L2CAP_PSM_LE_DYN_START 0x0080 #define L2CAP_PSM_LE_DYN_END 0x00ff /* channel identifier */ #define L2CAP_CID_SIGNALING 0x0001 #define L2CAP_CID_CONN_LESS 0x0002 #define L2CAP_CID_ATT 0x0004 #define L2CAP_CID_LE_SIGNALING 0x0005 #define L2CAP_CID_SMP 0x0006 #define L2CAP_CID_SMP_BREDR 0x0007 #define L2CAP_CID_DYN_START 0x0040 #define L2CAP_CID_DYN_END 0xffff #define L2CAP_CID_LE_DYN_END 0x007f /* connect/create channel results */ #define L2CAP_CR_SUCCESS 0x0000 #define L2CAP_CR_PEND 0x0001 #define L2CAP_CR_BAD_PSM 0x0002 #define L2CAP_CR_SEC_BLOCK 0x0003 #define L2CAP_CR_NO_MEM 0x0004 #define L2CAP_CR_INVALID_SCID 0x0006 #define L2CAP_CR_SCID_IN_USE 0x0007 /* credit based connect results */ #define L2CAP_CR_LE_SUCCESS 0x0000 #define L2CAP_CR_LE_BAD_PSM 0x0002 #define L2CAP_CR_LE_NO_MEM 0x0004 #define L2CAP_CR_LE_AUTHENTICATION 0x0005 #define L2CAP_CR_LE_AUTHORIZATION 0x0006 #define L2CAP_CR_LE_BAD_KEY_SIZE 0x0007 #define L2CAP_CR_LE_ENCRYPTION 0x0008 #define L2CAP_CR_LE_INVALID_SCID 0x0009 #define L2CAP_CR_LE_SCID_IN_USE 0X000A #define L2CAP_CR_LE_UNACCEPT_PARAMS 0X000B #define L2CAP_CR_LE_INVALID_PARAMS 0X000C /* connect/create channel status */ #define L2CAP_CS_NO_INFO 0x0000 #define L2CAP_CS_AUTHEN_PEND 0x0001 #define L2CAP_CS_AUTHOR_PEND 0x0002 struct l2cap_conf_req { __le16 dcid; __le16 flags; __u8 data[]; } __packed; struct l2cap_conf_rsp { __le16 scid; __le16 flags; __le16 result; __u8 data[]; } __packed; #define L2CAP_CONF_SUCCESS 0x0000 #define L2CAP_CONF_UNACCEPT 0x0001 #define L2CAP_CONF_REJECT 0x0002 #define L2CAP_CONF_UNKNOWN 0x0003 #define L2CAP_CONF_PENDING 0x0004 #define L2CAP_CONF_EFS_REJECT 0x0005 /* configuration req/rsp continuation flag */ #define L2CAP_CONF_FLAG_CONTINUATION 0x0001 struct l2cap_conf_opt { __u8 type; __u8 len; __u8 val[]; } __packed; #define L2CAP_CONF_OPT_SIZE 2 #define L2CAP_CONF_HINT 0x80 #define L2CAP_CONF_MASK 0x7f #define L2CAP_CONF_MTU 0x01 #define L2CAP_CONF_FLUSH_TO 0x02 #define L2CAP_CONF_QOS 0x03 #define L2CAP_CONF_RFC 0x04 #define L2CAP_CONF_FCS 0x05 #define L2CAP_CONF_EFS 0x06 #define L2CAP_CONF_EWS 0x07 #define L2CAP_CONF_MAX_SIZE 22 struct l2cap_conf_rfc { __u8 mode; __u8 txwin_size; __u8 max_transmit; __le16 retrans_timeout; __le16 monitor_timeout; __le16 max_pdu_size; } __packed; #define L2CAP_MODE_BASIC 0x00 #define L2CAP_MODE_RETRANS 0x01 #define 
L2CAP_MODE_FLOWCTL 0x02 #define L2CAP_MODE_ERTM 0x03 #define L2CAP_MODE_STREAMING 0x04 /* Unlike the above this one doesn't actually map to anything that would * ever be sent over the air. Therefore, use a value that's unlikely to * ever be used in the BR/EDR configuration phase. */ #define L2CAP_MODE_LE_FLOWCTL 0x80 #define L2CAP_MODE_EXT_FLOWCTL 0x81 struct l2cap_conf_efs { __u8 id; __u8 stype; __le16 msdu; __le32 sdu_itime; __le32 acc_lat; __le32 flush_to; } __packed; #define L2CAP_SERV_NOTRAFIC 0x00 #define L2CAP_SERV_BESTEFFORT 0x01 #define L2CAP_SERV_GUARANTEED 0x02 #define L2CAP_BESTEFFORT_ID 0x01 struct l2cap_disconn_req { __le16 dcid; __le16 scid; } __packed; struct l2cap_disconn_rsp { __le16 dcid; __le16 scid; } __packed; struct l2cap_info_req { __le16 type; } __packed; struct l2cap_info_rsp { __le16 type; __le16 result; __u8 data[]; } __packed; #define L2CAP_MR_SUCCESS 0x0000 #define L2CAP_MR_PEND 0x0001 #define L2CAP_MR_BAD_ID 0x0002 #define L2CAP_MR_SAME_ID 0x0003 #define L2CAP_MR_NOT_SUPP 0x0004 #define L2CAP_MR_COLLISION 0x0005 #define L2CAP_MR_NOT_ALLOWED 0x0006 struct l2cap_move_chan_cfm { __le16 icid; __le16 result; } __packed; #define L2CAP_MC_CONFIRMED 0x0000 #define L2CAP_MC_UNCONFIRMED 0x0001 struct l2cap_move_chan_cfm_rsp { __le16 icid; } __packed; /* info type */ #define L2CAP_IT_CL_MTU 0x0001 #define L2CAP_IT_FEAT_MASK 0x0002 #define L2CAP_IT_FIXED_CHAN 0x0003 /* info result */ #define L2CAP_IR_SUCCESS 0x0000 #define L2CAP_IR_NOTSUPP 0x0001 struct l2cap_conn_param_update_req { __le16 min; __le16 max; __le16 latency; __le16 to_multiplier; } __packed; struct l2cap_conn_param_update_rsp { __le16 result; } __packed; /* Connection Parameters result */ #define L2CAP_CONN_PARAM_ACCEPTED 0x0000 #define L2CAP_CONN_PARAM_REJECTED 0x0001 struct l2cap_le_conn_req { __le16 psm; __le16 scid; __le16 mtu; __le16 mps; __le16 credits; } __packed; struct l2cap_le_conn_rsp { __le16 dcid; __le16 mtu; __le16 mps; __le16 credits; __le16 result; } __packed; struct l2cap_le_credits { __le16 cid; __le16 credits; } __packed; #define L2CAP_ECRED_MIN_MTU 64 #define L2CAP_ECRED_MIN_MPS 64 #define L2CAP_ECRED_MAX_CID 5 struct l2cap_ecred_conn_req { /* New members must be added within the struct_group() macro below. */ __struct_group(l2cap_ecred_conn_req_hdr, hdr, __packed, __le16 psm; __le16 mtu; __le16 mps; __le16 credits; ); __le16 scid[]; } __packed; struct l2cap_ecred_conn_rsp { /* New members must be added within the struct_group() macro below. 
*/ struct_group_tagged(l2cap_ecred_conn_rsp_hdr, hdr, __le16 mtu; __le16 mps; __le16 credits; __le16 result; ); __le16 dcid[]; }; struct l2cap_ecred_reconf_req { __le16 mtu; __le16 mps; __le16 scid[]; } __packed; #define L2CAP_RECONF_SUCCESS 0x0000 #define L2CAP_RECONF_INVALID_MTU 0x0001 #define L2CAP_RECONF_INVALID_MPS 0x0002 struct l2cap_ecred_reconf_rsp { __le16 result; } __packed; /* ----- L2CAP channels and connections ----- */ struct l2cap_seq_list { __u16 head; __u16 tail; __u16 mask; __u16 *list; }; #define L2CAP_SEQ_LIST_CLEAR 0xFFFF #define L2CAP_SEQ_LIST_TAIL 0x8000 struct l2cap_chan { struct l2cap_conn *conn; struct kref kref; atomic_t nesting; __u8 state; bdaddr_t dst; __u8 dst_type; bdaddr_t src; __u8 src_type; __le16 psm; __le16 sport; __u16 dcid; __u16 scid; __u16 imtu; __u16 omtu; __u16 flush_to; __u8 mode; __u8 chan_type; __u8 chan_policy; __u8 sec_level; __u8 ident; __u8 conf_req[64]; __u8 conf_len; __u8 num_conf_req; __u8 num_conf_rsp; __u8 fcs; __u16 tx_win; __u16 tx_win_max; __u16 ack_win; __u8 max_tx; __u16 retrans_timeout; __u16 monitor_timeout; __u16 mps; __u16 tx_credits; __u16 rx_credits; /* estimated available receive buffer space or -1 if unknown */ ssize_t rx_avail; __u8 tx_state; __u8 rx_state; unsigned long conf_state; unsigned long conn_state; unsigned long flags; __u16 next_tx_seq; __u16 expected_ack_seq; __u16 expected_tx_seq; __u16 buffer_seq; __u16 srej_save_reqseq; __u16 last_acked_seq; __u16 frames_sent; __u16 unacked_frames; __u8 retry_count; __u16 sdu_len; struct sk_buff *sdu; struct sk_buff *sdu_last_frag; __u16 remote_tx_win; __u8 remote_max_tx; __u16 remote_mps; __u8 local_id; __u8 local_stype; __u16 local_msdu; __u32 local_sdu_itime; __u32 local_acc_lat; __u32 local_flush_to; __u8 remote_id; __u8 remote_stype; __u16 remote_msdu; __u32 remote_sdu_itime; __u32 remote_acc_lat; __u32 remote_flush_to; struct delayed_work chan_timer; struct delayed_work retrans_timer; struct delayed_work monitor_timer; struct delayed_work ack_timer; struct sk_buff *tx_send_head; struct sk_buff_head tx_q; struct sk_buff_head srej_q; struct l2cap_seq_list srej_list; struct l2cap_seq_list retrans_list; struct list_head list; struct list_head global_l; void *data; const struct l2cap_ops *ops; struct mutex lock; }; struct l2cap_ops { char *name; struct l2cap_chan *(*new_connection) (struct l2cap_chan *chan); int (*recv) (struct l2cap_chan * chan, struct sk_buff *skb); void (*teardown) (struct l2cap_chan *chan, int err); void (*close) (struct l2cap_chan *chan); void (*state_change) (struct l2cap_chan *chan, int state, int err); void (*ready) (struct l2cap_chan *chan); void (*defer) (struct l2cap_chan *chan); void (*resume) (struct l2cap_chan *chan); void (*suspend) (struct l2cap_chan *chan); void (*set_shutdown) (struct l2cap_chan *chan); long (*get_sndtimeo) (struct l2cap_chan *chan); struct pid *(*get_peer_pid) (struct l2cap_chan *chan); struct sk_buff *(*alloc_skb) (struct l2cap_chan *chan, unsigned long hdr_len, unsigned long len, int nb); int (*filter) (struct l2cap_chan * chan, struct sk_buff *skb); }; struct l2cap_conn { struct hci_conn *hcon; struct hci_chan *hchan; unsigned int mtu; __u32 feat_mask; __u8 remote_fixed_chan; __u8 local_fixed_chan; __u8 info_state; __u8 info_ident; struct delayed_work info_timer; struct sk_buff *rx_skb; __u32 rx_len; __u8 tx_ident; struct mutex ident_lock; struct sk_buff_head pending_rx; struct work_struct pending_rx_work; struct delayed_work id_addr_timer; __u8 disc_reason; struct l2cap_chan *smp; struct list_head chan_l; struct 
mutex chan_lock; struct kref ref; struct list_head users; }; struct l2cap_user { struct list_head list; int (*probe) (struct l2cap_conn *conn, struct l2cap_user *user); void (*remove) (struct l2cap_conn *conn, struct l2cap_user *user); }; #define L2CAP_INFO_CL_MTU_REQ_SENT 0x01 #define L2CAP_INFO_FEAT_MASK_REQ_SENT 0x04 #define L2CAP_INFO_FEAT_MASK_REQ_DONE 0x08 #define L2CAP_CHAN_RAW 1 #define L2CAP_CHAN_CONN_LESS 2 #define L2CAP_CHAN_CONN_ORIENTED 3 #define L2CAP_CHAN_FIXED 4 /* ----- L2CAP socket info ----- */ #define l2cap_pi(sk) ((struct l2cap_pinfo *) sk) struct l2cap_rx_busy { struct list_head list; struct sk_buff *skb; }; struct l2cap_pinfo { struct bt_sock bt; struct l2cap_chan *chan; struct list_head rx_busy; }; enum { CONF_REQ_SENT, CONF_INPUT_DONE, CONF_OUTPUT_DONE, CONF_MTU_DONE, CONF_MODE_DONE, CONF_CONNECT_PEND, CONF_RECV_NO_FCS, CONF_STATE2_DEVICE, CONF_EWS_RECV, CONF_LOC_CONF_PEND, CONF_REM_CONF_PEND, CONF_NOT_COMPLETE, }; #define L2CAP_CONF_MAX_CONF_REQ 2 #define L2CAP_CONF_MAX_CONF_RSP 2 enum { CONN_SREJ_SENT, CONN_WAIT_F, CONN_SREJ_ACT, CONN_SEND_PBIT, CONN_REMOTE_BUSY, CONN_LOCAL_BUSY, CONN_REJ_ACT, CONN_SEND_FBIT, CONN_RNR_SENT, }; /* Definitions for flags in l2cap_chan */ enum { FLAG_ROLE_SWITCH, FLAG_FORCE_ACTIVE, FLAG_FORCE_RELIABLE, FLAG_FLUSHABLE, FLAG_EXT_CTRL, FLAG_EFS_ENABLE, FLAG_DEFER_SETUP, FLAG_LE_CONN_REQ_SENT, FLAG_ECRED_CONN_REQ_SENT, FLAG_PENDING_SECURITY, FLAG_HOLD_HCI_CONN, }; /* Lock nesting levels for L2CAP channels. We need these because lockdep * otherwise considers all channels equal and will e.g. complain about a * connection oriented channel triggering SMP procedures or a listening * channel creating and locking a child channel. */ enum { L2CAP_NESTING_SMP, L2CAP_NESTING_NORMAL, L2CAP_NESTING_PARENT, }; enum { L2CAP_TX_STATE_XMIT, L2CAP_TX_STATE_WAIT_F, }; enum { L2CAP_RX_STATE_RECV, L2CAP_RX_STATE_SREJ_SENT, L2CAP_RX_STATE_MOVE, L2CAP_RX_STATE_WAIT_P, L2CAP_RX_STATE_WAIT_F, }; enum { L2CAP_TXSEQ_EXPECTED, L2CAP_TXSEQ_EXPECTED_SREJ, L2CAP_TXSEQ_UNEXPECTED, L2CAP_TXSEQ_UNEXPECTED_SREJ, L2CAP_TXSEQ_DUPLICATE, L2CAP_TXSEQ_DUPLICATE_SREJ, L2CAP_TXSEQ_INVALID, L2CAP_TXSEQ_INVALID_IGNORE, }; enum { L2CAP_EV_DATA_REQUEST, L2CAP_EV_LOCAL_BUSY_DETECTED, L2CAP_EV_LOCAL_BUSY_CLEAR, L2CAP_EV_RECV_REQSEQ_AND_FBIT, L2CAP_EV_RECV_FBIT, L2CAP_EV_RETRANS_TO, L2CAP_EV_MONITOR_TO, L2CAP_EV_EXPLICIT_POLL, L2CAP_EV_RECV_IFRAME, L2CAP_EV_RECV_RR, L2CAP_EV_RECV_REJ, L2CAP_EV_RECV_RNR, L2CAP_EV_RECV_SREJ, L2CAP_EV_RECV_FRAME, }; enum { L2CAP_MOVE_ROLE_NONE, L2CAP_MOVE_ROLE_INITIATOR, L2CAP_MOVE_ROLE_RESPONDER, }; enum { L2CAP_MOVE_STABLE, L2CAP_MOVE_WAIT_REQ, L2CAP_MOVE_WAIT_RSP, L2CAP_MOVE_WAIT_RSP_SUCCESS, L2CAP_MOVE_WAIT_CONFIRM, L2CAP_MOVE_WAIT_CONFIRM_RSP, L2CAP_MOVE_WAIT_LOGICAL_COMP, L2CAP_MOVE_WAIT_LOGICAL_CFM, L2CAP_MOVE_WAIT_LOCAL_BUSY, L2CAP_MOVE_WAIT_PREPARE, }; void l2cap_chan_hold(struct l2cap_chan *c); struct l2cap_chan *l2cap_chan_hold_unless_zero(struct l2cap_chan *c); void l2cap_chan_put(struct l2cap_chan *c); static inline void l2cap_chan_lock(struct l2cap_chan *chan) { mutex_lock_nested(&chan->lock, atomic_read(&chan->nesting)); } static inline void l2cap_chan_unlock(struct l2cap_chan *chan) { mutex_unlock(&chan->lock); } static inline void l2cap_set_timer(struct l2cap_chan *chan, struct delayed_work *work, long timeout) { BT_DBG("chan %p state %s timeout %ld", chan, state_to_string(chan->state), timeout); /* If delayed work cancelled do not hold(chan) since it is already done with previous set_timer */ if (!cancel_delayed_work(work)) 
l2cap_chan_hold(chan); schedule_delayed_work(work, timeout); } static inline bool l2cap_clear_timer(struct l2cap_chan *chan, struct delayed_work *work) { bool ret; /* put(chan) if delayed work cancelled otherwise it is done in delayed work function */ ret = cancel_delayed_work(work); if (ret) l2cap_chan_put(chan); return ret; } #define __set_chan_timer(c, t) l2cap_set_timer(c, &c->chan_timer, (t)) #define __clear_chan_timer(c) l2cap_clear_timer(c, &c->chan_timer) #define __clear_retrans_timer(c) l2cap_clear_timer(c, &c->retrans_timer) #define __clear_monitor_timer(c) l2cap_clear_timer(c, &c->monitor_timer) #define __set_ack_timer(c) l2cap_set_timer(c, &chan->ack_timer, \ msecs_to_jiffies(L2CAP_DEFAULT_ACK_TO)); #define __clear_ack_timer(c) l2cap_clear_timer(c, &c->ack_timer) static inline int __seq_offset(struct l2cap_chan *chan, __u16 seq1, __u16 seq2) { if (seq1 >= seq2) return seq1 - seq2; else return chan->tx_win_max + 1 - seq2 + seq1; } static inline __u16 __next_seq(struct l2cap_chan *chan, __u16 seq) { return (seq + 1) % (chan->tx_win_max + 1); } static inline struct l2cap_chan *l2cap_chan_no_new_connection(struct l2cap_chan *chan) { return NULL; } static inline int l2cap_chan_no_recv(struct l2cap_chan *chan, struct sk_buff *skb) { return -ENOSYS; } static inline struct sk_buff *l2cap_chan_no_alloc_skb(struct l2cap_chan *chan, unsigned long hdr_len, unsigned long len, int nb) { return ERR_PTR(-ENOSYS); } static inline void l2cap_chan_no_teardown(struct l2cap_chan *chan, int err) { } static inline void l2cap_chan_no_close(struct l2cap_chan *chan) { } static inline void l2cap_chan_no_ready(struct l2cap_chan *chan) { } static inline void l2cap_chan_no_state_change(struct l2cap_chan *chan, int state, int err) { } static inline void l2cap_chan_no_defer(struct l2cap_chan *chan) { } static inline void l2cap_chan_no_suspend(struct l2cap_chan *chan) { } static inline void l2cap_chan_no_resume(struct l2cap_chan *chan) { } static inline void l2cap_chan_no_set_shutdown(struct l2cap_chan *chan) { } static inline long l2cap_chan_no_get_sndtimeo(struct l2cap_chan *chan) { return 0; } extern bool disable_ertm; extern bool enable_ecred; int l2cap_init_sockets(void); void l2cap_cleanup_sockets(void); bool l2cap_is_socket(struct socket *sock); void __l2cap_le_connect_rsp_defer(struct l2cap_chan *chan); void __l2cap_ecred_conn_rsp_defer(struct l2cap_chan *chan); void __l2cap_connect_rsp_defer(struct l2cap_chan *chan); int l2cap_add_psm(struct l2cap_chan *chan, bdaddr_t *src, __le16 psm); int l2cap_add_scid(struct l2cap_chan *chan, __u16 scid); struct l2cap_chan *l2cap_chan_create(void); void l2cap_chan_close(struct l2cap_chan *chan, int reason); int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, bdaddr_t *dst, u8 dst_type, u16 timeout); int l2cap_chan_reconfigure(struct l2cap_chan *chan, __u16 mtu); int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len); void l2cap_chan_busy(struct l2cap_chan *chan, int busy); void l2cap_chan_rx_avail(struct l2cap_chan *chan, ssize_t rx_avail); int l2cap_chan_check_security(struct l2cap_chan *chan, bool initiator); void l2cap_chan_set_defaults(struct l2cap_chan *chan); int l2cap_ertm_init(struct l2cap_chan *chan); void l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan); void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan); typedef void (*l2cap_chan_func_t)(struct l2cap_chan *chan, void *data); void l2cap_chan_list(struct l2cap_conn *conn, l2cap_chan_func_t func, void *data); void 
l2cap_chan_del(struct l2cap_chan *chan, int err); void l2cap_send_conn_req(struct l2cap_chan *chan); struct l2cap_conn *l2cap_conn_get(struct l2cap_conn *conn); void l2cap_conn_put(struct l2cap_conn *conn); int l2cap_register_user(struct l2cap_conn *conn, struct l2cap_user *user); void l2cap_unregister_user(struct l2cap_conn *conn, struct l2cap_user *user); #endif /* __L2CAP_H */
15 15 24 24 24 24 6 46 46 46 50 50 50 50 50 50 49 50 50 50 56 56 56 56 9 9 56 56 56 50 50 56 56 56 9 56 22 56 22 31 56 56 56 31 31 55 56 56 56 56 56 56 56 69 69 69 69 69 69 69 69 69 69 69 24 16 13 12 13 13 24 24 24 24 23 23 13 16 15 16 24 22 47 47 47 47 47 47 47 47 12 12 25 25 25 24 25 25 25 25 1 23 5 25 25 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 
838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 
1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 
2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 
3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 
3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 
4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH * Copyright (C) 2018-2024 Intel Corporation * * utilities for mac80211 */ #include <net/mac80211.h> #include <linux/netdevice.h> #include <linux/export.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/skbuff.h> #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/bitmap.h> #include <linux/crc32.h> #include <net/net_namespace.h> #include <net/cfg80211.h> #include <net/rtnetlink.h> #include <kunit/visibility.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" #include "mesh.h" #include "wme.h" #include "led.h" #include "wep.h" /* privid for wiphys to determine whether they belong to us or not */ const void *const mac80211_wiphy_privid = &mac80211_wiphy_privid; struct ieee80211_hw *wiphy_to_ieee80211_hw(struct wiphy *wiphy) { struct ieee80211_local *local; local = wiphy_priv(wiphy); return &local->hw; } EXPORT_SYMBOL(wiphy_to_ieee80211_hw); const struct ieee80211_conn_settings ieee80211_conn_settings_unlimited = { .mode = IEEE80211_CONN_MODE_EHT, .bw_limit = IEEE80211_CONN_BW_LIMIT_320, }; u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, enum nl80211_iftype type) { __le16 fc = hdr->frame_control; if (ieee80211_is_data(fc)) { if (len < 24) /* drop incorrect hdr len (data) */ return NULL; if (ieee80211_has_a4(fc)) return NULL; if (ieee80211_has_tods(fc)) return hdr->addr1; if (ieee80211_has_fromds(fc)) return hdr->addr2; return hdr->addr3; } if (ieee80211_is_s1g_beacon(fc)) { struct ieee80211_ext *ext = (void *) hdr; return ext->u.s1g_beacon.sa; } if (ieee80211_is_mgmt(fc)) { if (len < 24) /* drop incorrect hdr len (mgmt) */ return NULL; return hdr->addr3; } if (ieee80211_is_ctl(fc)) { if (ieee80211_is_pspoll(fc)) return hdr->addr1; if (ieee80211_is_back_req(fc)) { switch (type) { case NL80211_IFTYPE_STATION: return hdr->addr2; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: return hdr->addr1; default: break; /* fall through to the return */ } } } return NULL; } EXPORT_SYMBOL(ieee80211_get_bssid); void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx) { struct sk_buff *skb; struct ieee80211_hdr *hdr; skb_queue_walk(&tx->skbs, skb) { hdr = (struct ieee80211_hdr *) skb->data; hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); } } int ieee80211_frame_duration(enum nl80211_band band, size_t len, int rate, int erp, int short_preamble) { int dur; /* calculate duration (in microseconds, rounded up to next higher * integer if it includes a fractional microsecond) to send frame of * len bytes (does not include FCS) at the given rate. Duration will * also include SIFS. * * rate is in 100 kbps, so divident is multiplied by 10 in the * DIV_ROUND_UP() operations. 
*/ if (band == NL80211_BAND_5GHZ || erp) { /* * OFDM: * * N_DBPS = DATARATE x 4 * N_SYM = Ceiling((16+8xLENGTH+6) / N_DBPS) * (16 = SIGNAL time, 6 = tail bits) * TXTIME = T_PREAMBLE + T_SIGNAL + T_SYM x N_SYM + Signal Ext * * T_SYM = 4 usec * 802.11a - 18.5.2: aSIFSTime = 16 usec * 802.11g - 19.8.4: aSIFSTime = 10 usec + * signal ext = 6 usec */ dur = 16; /* SIFS + signal ext */ dur += 16; /* IEEE 802.11-2012 18.3.2.4: T_PREAMBLE = 16 usec */ dur += 4; /* IEEE 802.11-2012 18.3.2.4: T_SIGNAL = 4 usec */ /* rates should already consider the channel bandwidth, * don't apply divisor again. */ dur += 4 * DIV_ROUND_UP((16 + 8 * (len + 4) + 6) * 10, 4 * rate); /* T_SYM x N_SYM */ } else { /* * 802.11b or 802.11g with 802.11b compatibility: * 18.3.4: TXTIME = PreambleLength + PLCPHeaderTime + * Ceiling(((LENGTH+PBCC)x8)/DATARATE). PBCC=0. * * 802.11 (DS): 15.3.3, 802.11b: 18.3.4 * aSIFSTime = 10 usec * aPreambleLength = 144 usec or 72 usec with short preamble * aPLCPHeaderLength = 48 usec or 24 usec with short preamble */ dur = 10; /* aSIFSTime = 10 usec */ dur += short_preamble ? (72 + 24) : (144 + 48); dur += DIV_ROUND_UP(8 * (len + 4) * 10, rate); } return dur; } /* Exported duration function for driver use */ __le16 ieee80211_generic_frame_duration(struct ieee80211_hw *hw, struct ieee80211_vif *vif, enum nl80211_band band, size_t frame_len, struct ieee80211_rate *rate) { struct ieee80211_sub_if_data *sdata; u16 dur; int erp; bool short_preamble = false; erp = 0; if (vif) { sdata = vif_to_sdata(vif); short_preamble = sdata->vif.bss_conf.use_short_preamble; if (sdata->deflink.operating_11g_mode) erp = rate->flags & IEEE80211_RATE_ERP_G; } dur = ieee80211_frame_duration(band, frame_len, rate->bitrate, erp, short_preamble); return cpu_to_le16(dur); } EXPORT_SYMBOL(ieee80211_generic_frame_duration); __le16 ieee80211_rts_duration(struct ieee80211_hw *hw, struct ieee80211_vif *vif, size_t frame_len, const struct ieee80211_tx_info *frame_txctl) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_rate *rate; struct ieee80211_sub_if_data *sdata; bool short_preamble; int erp, bitrate; u16 dur; struct ieee80211_supported_band *sband; sband = local->hw.wiphy->bands[frame_txctl->band]; short_preamble = false; rate = &sband->bitrates[frame_txctl->control.rts_cts_rate_idx]; erp = 0; if (vif) { sdata = vif_to_sdata(vif); short_preamble = sdata->vif.bss_conf.use_short_preamble; if (sdata->deflink.operating_11g_mode) erp = rate->flags & IEEE80211_RATE_ERP_G; } bitrate = rate->bitrate; /* CTS duration */ dur = ieee80211_frame_duration(sband->band, 10, bitrate, erp, short_preamble); /* Data frame duration */ dur += ieee80211_frame_duration(sband->band, frame_len, bitrate, erp, short_preamble); /* ACK duration */ dur += ieee80211_frame_duration(sband->band, 10, bitrate, erp, short_preamble); return cpu_to_le16(dur); } EXPORT_SYMBOL(ieee80211_rts_duration); __le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw, struct ieee80211_vif *vif, size_t frame_len, const struct ieee80211_tx_info *frame_txctl) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_rate *rate; struct ieee80211_sub_if_data *sdata; bool short_preamble; int erp, bitrate; u16 dur; struct ieee80211_supported_band *sband; sband = local->hw.wiphy->bands[frame_txctl->band]; short_preamble = false; rate = &sband->bitrates[frame_txctl->control.rts_cts_rate_idx]; erp = 0; if (vif) { sdata = vif_to_sdata(vif); short_preamble = sdata->vif.bss_conf.use_short_preamble; if (sdata->deflink.operating_11g_mode) erp = rate->flags 
& IEEE80211_RATE_ERP_G; } bitrate = rate->bitrate; /* Data frame duration */ dur = ieee80211_frame_duration(sband->band, frame_len, bitrate, erp, short_preamble); if (!(frame_txctl->flags & IEEE80211_TX_CTL_NO_ACK)) { /* ACK duration */ dur += ieee80211_frame_duration(sband->band, 10, bitrate, erp, short_preamble); } return cpu_to_le16(dur); } EXPORT_SYMBOL(ieee80211_ctstoself_duration); static void wake_tx_push_queue(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_txq *queue) { struct ieee80211_tx_control control = { .sta = queue->sta, }; struct sk_buff *skb; while (1) { skb = ieee80211_tx_dequeue(&local->hw, queue); if (!skb) break; drv_tx(local, &control, skb); } } /* wake_tx_queue handler for driver not implementing a custom one*/ void ieee80211_handle_wake_tx_queue(struct ieee80211_hw *hw, struct ieee80211_txq *txq) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->vif); struct ieee80211_txq *queue; spin_lock(&local->handle_wake_tx_queue_lock); /* Use ieee80211_next_txq() for airtime fairness accounting */ ieee80211_txq_schedule_start(hw, txq->ac); while ((queue = ieee80211_next_txq(hw, txq->ac))) { wake_tx_push_queue(local, sdata, queue); ieee80211_return_txq(hw, queue, false); } ieee80211_txq_schedule_end(hw, txq->ac); spin_unlock(&local->handle_wake_tx_queue_lock); } EXPORT_SYMBOL(ieee80211_handle_wake_tx_queue); static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac) { struct ieee80211_local *local = sdata->local; struct ieee80211_vif *vif = &sdata->vif; struct fq *fq = &local->fq; struct ps_data *ps = NULL; struct txq_info *txqi; struct sta_info *sta; int i; local_bh_disable(); spin_lock(&fq->lock); if (!test_bit(SDATA_STATE_RUNNING, &sdata->state)) goto out; if (sdata->vif.type == NL80211_IFTYPE_AP) ps = &sdata->bss->ps; list_for_each_entry_rcu(sta, &local->sta_list, list) { if (sdata != sta->sdata) continue; for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { struct ieee80211_txq *txq = sta->sta.txq[i]; if (!txq) continue; txqi = to_txq_info(txq); if (ac != txq->ac) continue; if (!test_and_clear_bit(IEEE80211_TXQ_DIRTY, &txqi->flags)) continue; spin_unlock(&fq->lock); drv_wake_tx_queue(local, txqi); spin_lock(&fq->lock); } } if (!vif->txq) goto out; txqi = to_txq_info(vif->txq); if (!test_and_clear_bit(IEEE80211_TXQ_DIRTY, &txqi->flags) || (ps && atomic_read(&ps->num_sta_ps)) || ac != vif->txq->ac) goto out; spin_unlock(&fq->lock); drv_wake_tx_queue(local, txqi); local_bh_enable(); return; out: spin_unlock(&fq->lock); local_bh_enable(); } static void __releases(&local->queue_stop_reason_lock) __acquires(&local->queue_stop_reason_lock) _ieee80211_wake_txqs(struct ieee80211_local *local, unsigned long *flags) { struct ieee80211_sub_if_data *sdata; int n_acs = IEEE80211_NUM_ACS; int i; rcu_read_lock(); if (local->hw.queues < IEEE80211_NUM_ACS) n_acs = 1; for (i = 0; i < local->hw.queues; i++) { if (local->queue_stop_reasons[i]) continue; spin_unlock_irqrestore(&local->queue_stop_reason_lock, *flags); list_for_each_entry_rcu(sdata, &local->interfaces, list) { int ac; for (ac = 0; ac < n_acs; ac++) { int ac_queue = sdata->vif.hw_queue[ac]; if (ac_queue == i || sdata->vif.cab_queue == i) __ieee80211_wake_txqs(sdata, ac); } } spin_lock_irqsave(&local->queue_stop_reason_lock, *flags); } rcu_read_unlock(); } void ieee80211_wake_txqs(struct tasklet_struct *t) { struct ieee80211_local *local = from_tasklet(local, t, wake_txqs_tasklet); unsigned long flags; 
spin_lock_irqsave(&local->queue_stop_reason_lock, flags); _ieee80211_wake_txqs(local, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted, unsigned long *flags) { struct ieee80211_local *local = hw_to_local(hw); trace_wake_queue(local, queue, reason); if (WARN_ON(queue >= hw->queues)) return; if (!test_bit(reason, &local->queue_stop_reasons[queue])) return; if (!refcounted) { local->q_stop_reasons[queue][reason] = 0; } else { local->q_stop_reasons[queue][reason]--; if (WARN_ON(local->q_stop_reasons[queue][reason] < 0)) local->q_stop_reasons[queue][reason] = 0; } if (local->q_stop_reasons[queue][reason] == 0) __clear_bit(reason, &local->queue_stop_reasons[queue]); if (local->queue_stop_reasons[queue] != 0) /* someone still has this queue stopped */ return; if (!skb_queue_empty(&local->pending[queue])) tasklet_schedule(&local->tx_pending_tasklet); /* * Calling _ieee80211_wake_txqs here can be a problem because it may * release queue_stop_reason_lock which has been taken by * __ieee80211_wake_queue's caller. It is certainly not very nice to * release someone's lock, but it is fine because all the callers of * __ieee80211_wake_queue call it right before releasing the lock. */ if (reason == IEEE80211_QUEUE_STOP_REASON_DRIVER) tasklet_schedule(&local->wake_txqs_tasklet); else _ieee80211_wake_txqs(local, flags); } void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); __ieee80211_wake_queue(hw, queue, reason, refcounted, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_wake_queue(struct ieee80211_hw *hw, int queue) { ieee80211_wake_queue_by_reason(hw, queue, IEEE80211_QUEUE_STOP_REASON_DRIVER, false); } EXPORT_SYMBOL(ieee80211_wake_queue); static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); trace_stop_queue(local, queue, reason); if (WARN_ON(queue >= hw->queues)) return; if (!refcounted) local->q_stop_reasons[queue][reason] = 1; else local->q_stop_reasons[queue][reason]++; set_bit(reason, &local->queue_stop_reasons[queue]); } void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); __ieee80211_stop_queue(hw, queue, reason, refcounted); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_stop_queue(struct ieee80211_hw *hw, int queue) { ieee80211_stop_queue_by_reason(hw, queue, IEEE80211_QUEUE_STOP_REASON_DRIVER, false); } EXPORT_SYMBOL(ieee80211_stop_queue); void ieee80211_add_pending_skb(struct ieee80211_local *local, struct sk_buff *skb) { struct ieee80211_hw *hw = &local->hw; unsigned long flags; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); int queue = info->hw_queue; if (WARN_ON(!info->control.vif)) { ieee80211_free_txskb(&local->hw, skb); return; } spin_lock_irqsave(&local->queue_stop_reason_lock, flags); __ieee80211_stop_queue(hw, queue, IEEE80211_QUEUE_STOP_REASON_SKB_ADD, false); __skb_queue_tail(&local->pending[queue], skb); __ieee80211_wake_queue(hw, queue, 
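/*
 * Illustrative driver-side usage of the stop/wake pair above (the foo_*
 * names are hypothetical): stop a queue when the hardware TX ring fills
 * and wake it again from the TX-completion path once descriptors are
 * reclaimed.
 *
 *	if (foo_tx_ring_full(ring))
 *		ieee80211_stop_queue(hw, skb_get_queue_mapping(skb));
 *
 *	ieee80211_wake_queue(hw, ring->queue);
 *
 * Both wrappers use IEEE80211_QUEUE_STOP_REASON_DRIVER non-refcounted,
 * so one wake clears one stop; refcounted reasons instead maintain the
 * per-reason counter in q_stop_reasons[] handled above.
 */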
IEEE80211_QUEUE_STOP_REASON_SKB_ADD, false, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_add_pending_skbs(struct ieee80211_local *local, struct sk_buff_head *skbs) { struct ieee80211_hw *hw = &local->hw; struct sk_buff *skb; unsigned long flags; int queue, i; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); while ((skb = skb_dequeue(skbs))) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); if (WARN_ON(!info->control.vif)) { ieee80211_free_txskb(&local->hw, skb); continue; } queue = info->hw_queue; __ieee80211_stop_queue(hw, queue, IEEE80211_QUEUE_STOP_REASON_SKB_ADD, false); __skb_queue_tail(&local->pending[queue], skb); } for (i = 0; i < hw->queues; i++) __ieee80211_wake_queue(hw, i, IEEE80211_QUEUE_STOP_REASON_SKB_ADD, false, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw, unsigned long queues, enum queue_stop_reason reason, bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; int i; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); for_each_set_bit(i, &queues, hw->queues) __ieee80211_stop_queue(hw, i, reason, refcounted); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_stop_queues(struct ieee80211_hw *hw) { ieee80211_stop_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_DRIVER, false); } EXPORT_SYMBOL(ieee80211_stop_queues); int ieee80211_queue_stopped(struct ieee80211_hw *hw, int queue) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; int ret; if (WARN_ON(queue >= hw->queues)) return true; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); ret = test_bit(IEEE80211_QUEUE_STOP_REASON_DRIVER, &local->queue_stop_reasons[queue]); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); return ret; } EXPORT_SYMBOL(ieee80211_queue_stopped); void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw, unsigned long queues, enum queue_stop_reason reason, bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; int i; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); for_each_set_bit(i, &queues, hw->queues) __ieee80211_wake_queue(hw, i, reason, refcounted, &flags); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_wake_queues(struct ieee80211_hw *hw) { ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_DRIVER, false); } EXPORT_SYMBOL(ieee80211_wake_queues); unsigned int ieee80211_get_vif_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { unsigned int queues; if (sdata && ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) { int ac; queues = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) if (sdata->vif.hw_queue[ac] != IEEE80211_INVAL_HW_QUEUE) queues |= BIT(sdata->vif.hw_queue[ac]); if (sdata->vif.cab_queue != IEEE80211_INVAL_HW_QUEUE) queues |= BIT(sdata->vif.cab_queue); } else { /* all queues */ queues = BIT(local->hw.queues) - 1; } return queues; } void __ieee80211_flush_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, unsigned int queues, bool drop) { if (!local->ops->flush) return; /* * If no queue was set, or if the HW doesn't support * IEEE80211_HW_QUEUE_CONTROL - flush all queues */ if (!queues || !ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) queues = ieee80211_get_vif_queues(local, sdata); ieee80211_stop_queues_by_reason(&local->hw, queues, 
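/*
 * Example of the bitmap built by ieee80211_get_vif_queues() above
 * (illustrative): with IEEE80211_HW_QUEUE_CONTROL set and an interface
 * using hw_queue[] = { 0, 1, 2, 3 } plus cab_queue = 4,
 *
 *	queues = BIT(0) | BIT(1) | BIT(2) | BIT(3) | BIT(4) = 0x1f
 *
 * while without queue control it falls back to every hardware queue,
 * BIT(local->hw.queues) - 1.
 */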
IEEE80211_QUEUE_STOP_REASON_FLUSH, false); if (drop) { struct sta_info *sta; /* Purge the queues, so the frames on them won't be * sent during __ieee80211_wake_queue() */ list_for_each_entry(sta, &local->sta_list, list) { if (sdata != sta->sdata) continue; ieee80211_purge_sta_txqs(sta); } } drv_flush(local, sdata, queues, drop); ieee80211_wake_queues_by_reason(&local->hw, queues, IEEE80211_QUEUE_STOP_REASON_FLUSH, false); } void ieee80211_flush_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, bool drop) { __ieee80211_flush_queues(local, sdata, 0, drop); } static void __iterate_interfaces(struct ieee80211_local *local, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data) { struct ieee80211_sub_if_data *sdata; bool active_only = iter_flags & IEEE80211_IFACE_ITER_ACTIVE; list_for_each_entry_rcu(sdata, &local->interfaces, list, lockdep_is_held(&local->iflist_mtx) || lockdep_is_held(&local->hw.wiphy->mtx)) { switch (sdata->vif.type) { case NL80211_IFTYPE_MONITOR: if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) && !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) continue; break; case NL80211_IFTYPE_AP_VLAN: continue; default: break; } if (!(iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL) && active_only && !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) continue; if ((iter_flags & IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER) && !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) continue; if (ieee80211_sdata_running(sdata) || !active_only) iterator(data, sdata->vif.addr, &sdata->vif); } sdata = rcu_dereference_check(local->monitor_sdata, lockdep_is_held(&local->iflist_mtx) || lockdep_is_held(&local->hw.wiphy->mtx)); if (sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) && (iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL || !active_only || sdata->flags & IEEE80211_SDATA_IN_DRIVER)) iterator(data, sdata->vif.addr, &sdata->vif); } void ieee80211_iterate_interfaces( struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data) { struct ieee80211_local *local = hw_to_local(hw); mutex_lock(&local->iflist_mtx); __iterate_interfaces(local, iter_flags, iterator, data); mutex_unlock(&local->iflist_mtx); } EXPORT_SYMBOL_GPL(ieee80211_iterate_interfaces); void ieee80211_iterate_active_interfaces_atomic( struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data) { struct ieee80211_local *local = hw_to_local(hw); rcu_read_lock(); __iterate_interfaces(local, iter_flags | IEEE80211_IFACE_ITER_ACTIVE, iterator, data); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_atomic); void ieee80211_iterate_active_interfaces_mtx( struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data) { struct ieee80211_local *local = hw_to_local(hw); lockdep_assert_wiphy(hw->wiphy); __iterate_interfaces(local, iter_flags | IEEE80211_IFACE_ITER_ACTIVE, iterator, data); } EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_mtx); static void __iterate_stations(struct ieee80211_local *local, void (*iterator)(void *data, struct ieee80211_sta *sta), void *data) { struct sta_info *sta; list_for_each_entry_rcu(sta, &local->sta_list, list, lockdep_is_held(&local->hw.wiphy->mtx)) { if (!sta->uploaded) continue; iterator(data, &sta->sta); } } void ieee80211_iterate_stations_atomic(struct ieee80211_hw *hw, void (*iterator)(void *data, struct ieee80211_sta *sta), void *data) { struct 
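/*
 * Illustrative use of the interface iterators above (the foo_* names and
 * 'priv' are hypothetical): program an address filter for every active
 * vif from atomic context.
 *
 *	static void foo_vif_iter(void *data, u8 *mac,
 *				 struct ieee80211_vif *vif)
 *	{
 *		foo_add_addr_filter(data, mac);
 *	}
 *
 *	ieee80211_iterate_active_interfaces_atomic(hw,
 *			IEEE80211_IFACE_ITER_NORMAL, foo_vif_iter, priv);
 *
 * The _atomic variant runs under rcu_read_lock(), so the callback must
 * not sleep; the _mtx variant requires the caller to hold the wiphy
 * mutex instead.
 */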
ieee80211_local *local = hw_to_local(hw); rcu_read_lock(); __iterate_stations(local, iterator, data); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_iterate_stations_atomic); void ieee80211_iterate_stations_mtx(struct ieee80211_hw *hw, void (*iterator)(void *data, struct ieee80211_sta *sta), void *data) { struct ieee80211_local *local = hw_to_local(hw); lockdep_assert_wiphy(local->hw.wiphy); __iterate_stations(local, iterator, data); } EXPORT_SYMBOL_GPL(ieee80211_iterate_stations_mtx); struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); if (!ieee80211_sdata_running(sdata) || !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) return NULL; return &sdata->vif; } EXPORT_SYMBOL_GPL(wdev_to_ieee80211_vif); struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif) { if (!vif) return NULL; return &vif_to_sdata(vif)->wdev; } EXPORT_SYMBOL_GPL(ieee80211_vif_to_wdev); /* * Nothing should have been stuffed into the workqueue during * the suspend->resume cycle. Since we can't check each caller * of this function if we are already quiescing / suspended, * check here and don't WARN since this can actually happen when * the rx path (for example) is racing against __ieee80211_suspend * and suspending / quiescing was set after the rx path checked * them. */ static bool ieee80211_can_queue_work(struct ieee80211_local *local) { if (local->quiescing || (local->suspended && !local->resuming)) { pr_warn("queueing ieee80211 work while going to suspend\n"); return false; } return true; } void ieee80211_queue_work(struct ieee80211_hw *hw, struct work_struct *work) { struct ieee80211_local *local = hw_to_local(hw); if (!ieee80211_can_queue_work(local)) return; queue_work(local->workqueue, work); } EXPORT_SYMBOL(ieee80211_queue_work); void ieee80211_queue_delayed_work(struct ieee80211_hw *hw, struct delayed_work *dwork, unsigned long delay) { struct ieee80211_local *local = hw_to_local(hw); if (!ieee80211_can_queue_work(local)) return; queue_delayed_work(local->workqueue, dwork, delay); } EXPORT_SYMBOL(ieee80211_queue_delayed_work); void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata, struct ieee80211_tx_queue_params *qparam, int ac) { struct ieee80211_chanctx_conf *chanctx_conf; const struct ieee80211_reg_rule *rrule; const struct ieee80211_wmm_ac *wmm_ac; u16 center_freq = 0; if (sdata->vif.type != NL80211_IFTYPE_AP && sdata->vif.type != NL80211_IFTYPE_STATION) return; rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (chanctx_conf) center_freq = chanctx_conf->def.chan->center_freq; if (!center_freq) { rcu_read_unlock(); return; } rrule = freq_reg_info(sdata->wdev.wiphy, MHZ_TO_KHZ(center_freq)); if (IS_ERR_OR_NULL(rrule) || !rrule->has_wmm) { rcu_read_unlock(); return; } if (sdata->vif.type == NL80211_IFTYPE_AP) wmm_ac = &rrule->wmm_rule.ap[ac]; else wmm_ac = &rrule->wmm_rule.client[ac]; qparam->cw_min = max_t(u16, qparam->cw_min, wmm_ac->cw_min); qparam->cw_max = max_t(u16, qparam->cw_max, wmm_ac->cw_max); qparam->aifs = max_t(u8, qparam->aifs, wmm_ac->aifsn); qparam->txop = min_t(u16, qparam->txop, wmm_ac->cot / 32); rcu_read_unlock(); } void ieee80211_set_wmm_default(struct ieee80211_link_data *link, bool bss_notify, bool enable_qos) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_tx_queue_params qparam; struct ieee80211_chanctx_conf *chanctx_conf; int ac; bool use_11b; bool is_ocb; /* 
Use another EDCA parameters if dot11OCBActivated=true */ int aCWmin, aCWmax; if (!local->ops->conf_tx) return; if (local->hw.queues < IEEE80211_NUM_ACS) return; memset(&qparam, 0, sizeof(qparam)); rcu_read_lock(); chanctx_conf = rcu_dereference(link->conf->chanctx_conf); use_11b = (chanctx_conf && chanctx_conf->def.chan->band == NL80211_BAND_2GHZ) && !link->operating_11g_mode; rcu_read_unlock(); is_ocb = (sdata->vif.type == NL80211_IFTYPE_OCB); /* Set defaults according to 802.11-2007 Table 7-37 */ aCWmax = 1023; if (use_11b) aCWmin = 31; else aCWmin = 15; /* Configure old 802.11b/g medium access rules. */ qparam.cw_max = aCWmax; qparam.cw_min = aCWmin; qparam.txop = 0; qparam.aifs = 2; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { /* Update if QoS is enabled. */ if (enable_qos) { switch (ac) { case IEEE80211_AC_BK: qparam.cw_max = aCWmax; qparam.cw_min = aCWmin; qparam.txop = 0; if (is_ocb) qparam.aifs = 9; else qparam.aifs = 7; break; /* never happens but let's not leave undefined */ default: case IEEE80211_AC_BE: qparam.cw_max = aCWmax; qparam.cw_min = aCWmin; qparam.txop = 0; if (is_ocb) qparam.aifs = 6; else qparam.aifs = 3; break; case IEEE80211_AC_VI: qparam.cw_max = aCWmin; qparam.cw_min = (aCWmin + 1) / 2 - 1; if (is_ocb) qparam.txop = 0; else if (use_11b) qparam.txop = 6016/32; else qparam.txop = 3008/32; if (is_ocb) qparam.aifs = 3; else qparam.aifs = 2; break; case IEEE80211_AC_VO: qparam.cw_max = (aCWmin + 1) / 2 - 1; qparam.cw_min = (aCWmin + 1) / 4 - 1; if (is_ocb) qparam.txop = 0; else if (use_11b) qparam.txop = 3264/32; else qparam.txop = 1504/32; qparam.aifs = 2; break; } } ieee80211_regulatory_limit_wmm_params(sdata, &qparam, ac); qparam.uapsd = false; link->tx_conf[ac] = qparam; drv_conf_tx(local, link, ac, &qparam); } if (sdata->vif.type != NL80211_IFTYPE_MONITOR && sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && sdata->vif.type != NL80211_IFTYPE_NAN) { link->conf->qos = enable_qos; if (bss_notify) ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_QOS); } } void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, u16 transaction, u16 auth_alg, u16 status, const u8 *extra, size_t extra_len, const u8 *da, const u8 *bssid, const u8 *key, u8 key_len, u8 key_idx, u32 tx_flags) { struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; bool multi_link = ieee80211_vif_is_mld(&sdata->vif); struct { u8 id; u8 len; u8 ext_id; struct ieee80211_multi_link_elem ml; struct ieee80211_mle_basic_common_info basic; } __packed mle = { .id = WLAN_EID_EXTENSION, .len = sizeof(mle) - 2, .ext_id = WLAN_EID_EXT_EHT_MULTI_LINK, .ml.control = cpu_to_le16(IEEE80211_ML_CONTROL_TYPE_BASIC), .basic.len = sizeof(mle.basic), }; int err; memcpy(mle.basic.mld_mac_addr, sdata->vif.addr, ETH_ALEN); /* 24 + 6 = header + auth_algo + auth_transaction + status_code */ skb = dev_alloc_skb(local->hw.extra_tx_headroom + IEEE80211_WEP_IV_LEN + 24 + 6 + extra_len + IEEE80211_WEP_ICV_LEN + multi_link * sizeof(mle)); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom + IEEE80211_WEP_IV_LEN); mgmt = skb_put_zero(skb, 24 + 6); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_AUTH); memcpy(mgmt->da, da, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, bssid, ETH_ALEN); mgmt->u.auth.auth_alg = cpu_to_le16(auth_alg); mgmt->u.auth.auth_transaction = cpu_to_le16(transaction); mgmt->u.auth.status_code = cpu_to_le16(status); if (extra) skb_put_data(skb, extra, extra_len); if (multi_link) skb_put_data(skb, &mle, 
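/*
 * Resulting defaults from the QoS branch in ieee80211_set_wmm_default()
 * above for an OFDM (non-11b, non-OCB) link, i.e. aCWmin = 15,
 * aCWmax = 1023 (illustrative summary):
 *
 *	AC	cw_min	cw_max	aifs	txop (32 usec units)
 *	BK	15	1023	7	0
 *	BE	15	1023	3	0
 *	VI	7	15	2	94  (3008/32)
 *	VO	3	7	2	47  (1504/32)
 *
 * before any regulatory clamping by
 * ieee80211_regulatory_limit_wmm_params().
 */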
sizeof(mle)); if (auth_alg == WLAN_AUTH_SHARED_KEY && transaction == 3) { mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); err = ieee80211_wep_encrypt(local, skb, key, key_len, key_idx); if (WARN_ON(err)) { kfree_skb(skb); return; } } IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | tx_flags; ieee80211_tx_skb(sdata, skb); } void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, const u8 *da, const u8 *bssid, u16 stype, u16 reason, bool send_frame, u8 *frame_buf) { struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *mgmt = (void *)frame_buf; /* build frame */ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | stype); mgmt->duration = 0; /* initialize only */ mgmt->seq_ctrl = 0; /* initialize only */ memcpy(mgmt->da, da, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, bssid, ETH_ALEN); /* u.deauth.reason_code == u.disassoc.reason_code */ mgmt->u.deauth.reason_code = cpu_to_le16(reason); if (send_frame) { skb = dev_alloc_skb(local->hw.extra_tx_headroom + IEEE80211_DEAUTH_FRAME_LEN); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); /* copy in frame */ skb_put_data(skb, mgmt, IEEE80211_DEAUTH_FRAME_LEN); if (sdata->vif.type != NL80211_IFTYPE_STATION || !(sdata->u.mgd.flags & IEEE80211_STA_MFP_ENABLED)) IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; ieee80211_tx_skb(sdata, skb); } } static int ieee80211_put_s1g_cap(struct sk_buff *skb, struct ieee80211_sta_s1g_cap *s1g_cap) { if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_s1g_cap)) return -ENOBUFS; skb_put_u8(skb, WLAN_EID_S1G_CAPABILITIES); skb_put_u8(skb, sizeof(struct ieee80211_s1g_cap)); skb_put_data(skb, &s1g_cap->cap, sizeof(s1g_cap->cap)); skb_put_data(skb, &s1g_cap->nss_mcs, sizeof(s1g_cap->nss_mcs)); return 0; } static int ieee80211_put_preq_ies_band(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, const u8 *ie, size_t ie_len, size_t *offset, enum nl80211_band band, u32 rate_mask, struct cfg80211_chan_def *chandef, u32 flags) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; int i, err; size_t noffset; u32 rate_flags; bool have_80mhz = false; *offset = 0; sband = local->hw.wiphy->bands[band]; if (WARN_ON_ONCE(!sband)) return 0; rate_flags = ieee80211_chandef_rate_flags(chandef); /* For direct scan add S1G IE and consider its override bits */ if (band == NL80211_BAND_S1GHZ) return ieee80211_put_s1g_cap(skb, &sband->s1g_cap); err = ieee80211_put_srates_elem(skb, sband, 0, rate_flags, ~rate_mask, WLAN_EID_SUPP_RATES); if (err) return err; /* insert "request information" if in custom IEs */ if (ie && ie_len) { static const u8 before_extrates[] = { WLAN_EID_SSID, WLAN_EID_SUPP_RATES, WLAN_EID_REQUEST, }; noffset = ieee80211_ie_split(ie, ie_len, before_extrates, ARRAY_SIZE(before_extrates), *offset); if (skb_tailroom(skb) < noffset - *offset) return -ENOBUFS; skb_put_data(skb, ie + *offset, noffset - *offset); *offset = noffset; } err = ieee80211_put_srates_elem(skb, sband, 0, rate_flags, ~rate_mask, WLAN_EID_EXT_SUPP_RATES); if (err) return err; if (chandef->chan && sband->band == NL80211_BAND_2GHZ) { if (skb_tailroom(skb) < 3) return -ENOBUFS; skb_put_u8(skb, WLAN_EID_DS_PARAMS); skb_put_u8(skb, 1); skb_put_u8(skb, ieee80211_frequency_to_channel(chandef->chan->center_freq)); } if (flags & IEEE80211_PROBE_FLAG_MIN_CONTENT) return 0; /* insert custom IEs that go before HT */ if (ie && ie_len) { static const u8 before_ht[] = { /* * no need 
to list the ones split off already * (or generated here) */ WLAN_EID_DS_PARAMS, WLAN_EID_SUPPORTED_REGULATORY_CLASSES, }; noffset = ieee80211_ie_split(ie, ie_len, before_ht, ARRAY_SIZE(before_ht), *offset); if (skb_tailroom(skb) < noffset - *offset) return -ENOBUFS; skb_put_data(skb, ie + *offset, noffset - *offset); *offset = noffset; } if (sband->ht_cap.ht_supported) { u8 *pos; if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_ht_cap)) return -ENOBUFS; pos = skb_put(skb, 2 + sizeof(struct ieee80211_ht_cap)); ieee80211_ie_build_ht_cap(pos, &sband->ht_cap, sband->ht_cap.cap); } /* insert custom IEs that go before VHT */ if (ie && ie_len) { static const u8 before_vht[] = { /* * no need to list the ones split off already * (or generated here) */ WLAN_EID_BSS_COEX_2040, WLAN_EID_EXT_CAPABILITY, WLAN_EID_SSID_LIST, WLAN_EID_CHANNEL_USAGE, WLAN_EID_INTERWORKING, WLAN_EID_MESH_ID, /* 60 GHz (Multi-band, DMG, MMS) can't happen */ }; noffset = ieee80211_ie_split(ie, ie_len, before_vht, ARRAY_SIZE(before_vht), *offset); if (skb_tailroom(skb) < noffset - *offset) return -ENOBUFS; skb_put_data(skb, ie + *offset, noffset - *offset); *offset = noffset; } /* Check if any channel in this sband supports at least 80 MHz */ for (i = 0; i < sband->n_channels; i++) { if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED | IEEE80211_CHAN_NO_80MHZ)) continue; have_80mhz = true; break; } if (sband->vht_cap.vht_supported && have_80mhz) { u8 *pos; if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_vht_cap)) return -ENOBUFS; pos = skb_put(skb, 2 + sizeof(struct ieee80211_vht_cap)); ieee80211_ie_build_vht_cap(pos, &sband->vht_cap, sband->vht_cap.cap); } /* insert custom IEs that go before HE */ if (ie && ie_len) { static const u8 before_he[] = { /* * no need to list the ones split off before VHT * or generated here */ WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_REQ_PARAMS, WLAN_EID_AP_CSN, /* TODO: add 11ah/11aj/11ak elements */ }; noffset = ieee80211_ie_split(ie, ie_len, before_he, ARRAY_SIZE(before_he), *offset); if (skb_tailroom(skb) < noffset - *offset) return -ENOBUFS; skb_put_data(skb, ie + *offset, noffset - *offset); *offset = noffset; } if (cfg80211_any_usable_channels(local->hw.wiphy, BIT(sband->band), IEEE80211_CHAN_NO_HE)) { err = ieee80211_put_he_cap(skb, sdata, sband, NULL); if (err) return err; } if (cfg80211_any_usable_channels(local->hw.wiphy, BIT(sband->band), IEEE80211_CHAN_NO_HE | IEEE80211_CHAN_NO_EHT)) { err = ieee80211_put_eht_cap(skb, sdata, sband, NULL); if (err) return err; } err = ieee80211_put_he_6ghz_cap(skb, sdata, IEEE80211_SMPS_OFF); if (err) return err; /* * If adding more here, adjust code in main.c * that calculates local->scan_ies_len. 
*/ return 0; } static int ieee80211_put_preq_ies(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, struct ieee80211_scan_ies *ie_desc, const u8 *ie, size_t ie_len, u8 bands_used, u32 *rate_masks, struct cfg80211_chan_def *chandef, u32 flags) { size_t custom_ie_offset = 0; int i, err; memset(ie_desc, 0, sizeof(*ie_desc)); for (i = 0; i < NUM_NL80211_BANDS; i++) { if (bands_used & BIT(i)) { ie_desc->ies[i] = skb_tail_pointer(skb); err = ieee80211_put_preq_ies_band(skb, sdata, ie, ie_len, &custom_ie_offset, i, rate_masks[i], chandef, flags); if (err) return err; ie_desc->len[i] = skb_tail_pointer(skb) - ie_desc->ies[i]; } } /* add any remaining custom IEs */ if (ie && ie_len) { if (WARN_ONCE(skb_tailroom(skb) < ie_len - custom_ie_offset, "not enough space for preq custom IEs\n")) return -ENOBUFS; ie_desc->common_ies = skb_tail_pointer(skb); skb_put_data(skb, ie + custom_ie_offset, ie_len - custom_ie_offset); ie_desc->common_ie_len = skb_tail_pointer(skb) - ie_desc->common_ies; } return 0; }; int ieee80211_build_preq_ies(struct ieee80211_sub_if_data *sdata, u8 *buffer, size_t buffer_len, struct ieee80211_scan_ies *ie_desc, const u8 *ie, size_t ie_len, u8 bands_used, u32 *rate_masks, struct cfg80211_chan_def *chandef, u32 flags) { struct sk_buff *skb = alloc_skb(buffer_len, GFP_KERNEL); uintptr_t offs; int ret, i; u8 *start; if (!skb) return -ENOMEM; start = skb_tail_pointer(skb); memset(start, 0, skb_tailroom(skb)); ret = ieee80211_put_preq_ies(skb, sdata, ie_desc, ie, ie_len, bands_used, rate_masks, chandef, flags); if (ret < 0) { goto out; } if (skb->len > buffer_len) { ret = -ENOBUFS; goto out; } memcpy(buffer, start, skb->len); /* adjust ie_desc for copy */ for (i = 0; i < NUM_NL80211_BANDS; i++) { offs = ie_desc->ies[i] - start; ie_desc->ies[i] = buffer + offs; } offs = ie_desc->common_ies - start; ie_desc->common_ies = buffer + offs; ret = skb->len; out: consume_skb(skb); return ret; } struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, const u8 *src, const u8 *dst, u32 ratemask, struct ieee80211_channel *chan, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len, u32 flags) { struct ieee80211_local *local = sdata->local; struct cfg80211_chan_def chandef; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; u32 rate_masks[NUM_NL80211_BANDS] = {}; struct ieee80211_scan_ies dummy_ie_desc; /* * Do not send DS Channel parameter for directed probe requests * in order to maximize the chance that we get a response. Some * badly-behaved APs don't respond when this parameter is included. 
*/ chandef.width = sdata->vif.bss_conf.chanreq.oper.width; if (flags & IEEE80211_PROBE_FLAG_DIRECTED) chandef.chan = NULL; else chandef.chan = chan; skb = ieee80211_probereq_get(&local->hw, src, ssid, ssid_len, local->scan_ies_len + ie_len); if (!skb) return NULL; rate_masks[chan->band] = ratemask; ieee80211_put_preq_ies(skb, sdata, &dummy_ie_desc, ie, ie_len, BIT(chan->band), rate_masks, &chandef, flags); if (dst) { mgmt = (struct ieee80211_mgmt *) skb->data; memcpy(mgmt->da, dst, ETH_ALEN); memcpy(mgmt->bssid, dst, ETH_ALEN); } IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; return skb; } u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band band, u32 *basic_rates) { struct ieee80211_supported_band *sband; size_t num_rates; u32 supp_rates, rate_flags; int i, j; sband = sdata->local->hw.wiphy->bands[band]; if (WARN_ON(!sband)) return 1; rate_flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper); num_rates = sband->n_bitrates; supp_rates = 0; for (i = 0; i < elems->supp_rates_len + elems->ext_supp_rates_len; i++) { u8 rate = 0; int own_rate; bool is_basic; if (i < elems->supp_rates_len) rate = elems->supp_rates[i]; else if (elems->ext_supp_rates) rate = elems->ext_supp_rates [i - elems->supp_rates_len]; own_rate = 5 * (rate & 0x7f); is_basic = !!(rate & 0x80); if (is_basic && (rate & 0x7f) == BSS_MEMBERSHIP_SELECTOR_HT_PHY) continue; for (j = 0; j < num_rates; j++) { int brate; if ((rate_flags & sband->bitrates[j].flags) != rate_flags) continue; brate = sband->bitrates[j].bitrate; if (brate == own_rate) { supp_rates |= BIT(j); if (basic_rates && is_basic) *basic_rates |= BIT(j); } } } return supp_rates; } void ieee80211_stop_device(struct ieee80211_local *local, bool suspend) { local_bh_disable(); ieee80211_handle_queued_frames(local); local_bh_enable(); ieee80211_led_radio(local, false); ieee80211_mod_tpt_led_trig(local, 0, IEEE80211_TPT_LEDTRIG_FL_RADIO); wiphy_work_cancel(local->hw.wiphy, &local->reconfig_filter); flush_workqueue(local->workqueue); wiphy_work_flush(local->hw.wiphy, NULL); drv_stop(local, suspend); } static void ieee80211_flush_completed_scan(struct ieee80211_local *local, bool aborted) { /* It's possible that we don't handle the scan completion in * time during suspend, so if it's still marked as completed * here, queue the work and flush it to clean things up. * Instead of calling the worker function directly here, we * really queue it to avoid potential races with other flows * scheduling the same work. */ if (test_bit(SCAN_COMPLETED, &local->scanning)) { /* If coming from reconfiguration failure, abort the scan so * we don't attempt to continue a partial HW scan - which is * possible otherwise if (e.g.) the 2.4 GHz portion was the * completed scan, and a 5 GHz portion is still pending. */ if (aborted) set_bit(SCAN_ABORTED, &local->scanning); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, 0); wiphy_delayed_work_flush(local->hw.wiphy, &local->scan_work); } } static void ieee80211_handle_reconfig_failure(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; struct ieee80211_chanctx *ctx; lockdep_assert_wiphy(local->hw.wiphy); /* * We get here if during resume the device can't be restarted properly. * We might also get here if this happens during HW reset, which is a * slightly different situation and we need to drop all connections in * the latter case. 
* * Ask cfg80211 to turn off all interfaces, this will result in more * warnings but at least we'll then get into a clean stopped state. */ local->resuming = false; local->suspended = false; local->in_reconfig = false; local->reconfig_failure = true; ieee80211_flush_completed_scan(local, true); /* scheduled scan clearly can't be running any more, but tell * cfg80211 and clear local state */ ieee80211_sched_scan_end(local); list_for_each_entry(sdata, &local->interfaces, list) sdata->flags &= ~IEEE80211_SDATA_IN_DRIVER; /* Mark channel contexts as not being in the driver any more to avoid * removing them from the driver during the shutdown process... */ list_for_each_entry(ctx, &local->chanctx_list, list) ctx->driver_present = false; } static void ieee80211_assign_chanctx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link) { struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *ctx; lockdep_assert_wiphy(local->hw.wiphy); conf = rcu_dereference_protected(link->conf->chanctx_conf, lockdep_is_held(&local->hw.wiphy->mtx)); if (conf) { ctx = container_of(conf, struct ieee80211_chanctx, conf); drv_assign_vif_chanctx(local, sdata, link->conf, ctx); } } static void ieee80211_reconfig_stations(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct sta_info *sta; lockdep_assert_wiphy(local->hw.wiphy); /* add STAs back */ list_for_each_entry(sta, &local->sta_list, list) { enum ieee80211_sta_state state; if (!sta->uploaded || sta->sdata != sdata) continue; for (state = IEEE80211_STA_NOTEXIST; state < sta->sta_state; state++) WARN_ON(drv_sta_state(local, sta->sdata, sta, state, state + 1)); } } static int ieee80211_reconfig_nan(struct ieee80211_sub_if_data *sdata) { struct cfg80211_nan_func *func, **funcs; int res, id, i = 0; res = drv_start_nan(sdata->local, sdata, &sdata->u.nan.conf); if (WARN_ON(res)) return res; funcs = kcalloc(sdata->local->hw.max_nan_de_entries + 1, sizeof(*funcs), GFP_KERNEL); if (!funcs) return -ENOMEM; /* Add all the functions: * This is a little bit ugly. We need to call a potentially sleeping * callback for each NAN function, so we can't hold the spinlock. 
*/ spin_lock_bh(&sdata->u.nan.func_lock); idr_for_each_entry(&sdata->u.nan.function_inst_ids, func, id) funcs[i++] = func; spin_unlock_bh(&sdata->u.nan.func_lock); for (i = 0; funcs[i]; i++) { res = drv_add_nan_func(sdata->local, sdata, funcs[i]); if (WARN_ON(res)) ieee80211_nan_func_terminated(&sdata->vif, funcs[i]->instance_id, NL80211_NAN_FUNC_TERM_REASON_ERROR, GFP_KERNEL); } kfree(funcs); return 0; } static void ieee80211_reconfig_ap_links(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u64 changed) { int link_id; for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link; if (!(sdata->vif.active_links & BIT(link_id))) continue; link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; if (rcu_access_pointer(link->u.ap.beacon)) drv_start_ap(local, sdata, link->conf); if (!link->conf->enable_beacon) continue; changed |= BSS_CHANGED_BEACON | BSS_CHANGED_BEACON_ENABLED; ieee80211_link_info_change_notify(sdata, link, changed); } } int ieee80211_reconfig(struct ieee80211_local *local) { struct ieee80211_hw *hw = &local->hw; struct ieee80211_sub_if_data *sdata; struct ieee80211_chanctx *ctx; struct sta_info *sta; int res, i; bool reconfig_due_to_wowlan = false; struct ieee80211_sub_if_data *sched_scan_sdata; struct cfg80211_sched_scan_request *sched_scan_req; bool sched_scan_stopped = false; bool suspended = local->suspended; bool in_reconfig = false; lockdep_assert_wiphy(local->hw.wiphy); /* nothing to do if HW shouldn't run */ if (!local->open_count) goto wake_up; #ifdef CONFIG_PM if (suspended) local->resuming = true; if (local->wowlan) { /* * In the wowlan case, both mac80211 and the device * are functional when the resume op is called, so * clear local->suspended so the device could operate * normally (e.g. pass rx frames). */ local->suspended = false; res = drv_resume(local); local->wowlan = false; if (res < 0) { local->resuming = false; return res; } if (res == 0) goto wake_up; WARN_ON(res > 1); /* * res is 1, which means the driver requested * to go through a regular reset on wakeup. * restore local->suspended in this case. */ reconfig_due_to_wowlan = true; local->suspended = true; } #endif /* * In case of hw_restart during suspend (without wowlan), * cancel restart work, as we are reconfiguring the device * anyway. * Note that restart_work is scheduled on a frozen workqueue, * so we can't deadlock in this case. */ if (suspended && local->in_reconfig && !reconfig_due_to_wowlan) cancel_work_sync(&local->restart_work); local->started = false; /* * Upon resume hardware can sometimes be goofy due to * various platform / driver / bus issues, so restarting * the device may at times not work immediately. Propagate * the error. */ res = drv_start(local); if (res) { if (suspended) WARN(1, "Hardware became unavailable upon resume. 
This could be a software issue prior to suspend or a hardware issue.\n"); else WARN(1, "Hardware became unavailable during restart.\n"); ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_SUSPEND, false); ieee80211_handle_reconfig_failure(local); return res; } /* setup fragmentation threshold */ drv_set_frag_threshold(local, hw->wiphy->frag_threshold); /* setup RTS threshold */ drv_set_rts_threshold(local, hw->wiphy->rts_threshold); /* reset coverage class */ drv_set_coverage_class(local, hw->wiphy->coverage_class); ieee80211_led_radio(local, true); ieee80211_mod_tpt_led_trig(local, IEEE80211_TPT_LEDTRIG_FL_RADIO, 0); /* add interfaces */ sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); if (sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) { /* in HW restart it exists already */ WARN_ON(local->resuming); res = drv_add_interface(local, sdata); if (WARN_ON(res)) { RCU_INIT_POINTER(local->monitor_sdata, NULL); synchronize_net(); kfree(sdata); } } list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_MONITOR && !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) continue; if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN && ieee80211_sdata_running(sdata)) { res = drv_add_interface(local, sdata); if (WARN_ON(res)) break; } } /* If adding any of the interfaces failed above, roll back and * report failure. */ if (res) { list_for_each_entry_continue_reverse(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_MONITOR && !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) continue; if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN && ieee80211_sdata_running(sdata)) drv_remove_interface(local, sdata); } ieee80211_handle_reconfig_failure(local); return res; } /* add channel contexts */ list_for_each_entry(ctx, &local->chanctx_list, list) if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) WARN_ON(drv_add_chanctx(local, ctx)); sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); if (sdata && ieee80211_sdata_running(sdata)) ieee80211_assign_chanctx(local, sdata, &sdata->deflink); /* reconfigure hardware */ ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_LISTEN_INTERVAL | IEEE80211_CONF_CHANGE_MONITOR | IEEE80211_CONF_CHANGE_PS | IEEE80211_CONF_CHANGE_RETRY_LIMITS | IEEE80211_CONF_CHANGE_IDLE); ieee80211_configure_filter(local); /* Finally also reconfigure all the BSS information */ list_for_each_entry(sdata, &local->interfaces, list) { /* common change flags for all interface types - link only */ u64 changed = BSS_CHANGED_ERP_CTS_PROT | BSS_CHANGED_ERP_PREAMBLE | BSS_CHANGED_ERP_SLOT | BSS_CHANGED_HT | BSS_CHANGED_BASIC_RATES | BSS_CHANGED_BEACON_INT | BSS_CHANGED_BSSID | BSS_CHANGED_CQM | BSS_CHANGED_QOS | BSS_CHANGED_TXPOWER | BSS_CHANGED_MCAST_RATE; struct ieee80211_link_data *link = NULL; unsigned int link_id; u32 active_links = 0; if (!ieee80211_sdata_running(sdata)) continue; if (ieee80211_vif_is_mld(&sdata->vif)) { struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS] = { [0] = &sdata->vif.bss_conf, }; if (sdata->vif.type == NL80211_IFTYPE_STATION) { /* start with a single active link */ active_links = sdata->vif.active_links; link_id = ffs(active_links) - 1; sdata->vif.active_links = BIT(link_id); } drv_change_vif_links(local, sdata, 0, sdata->vif.active_links, old); } sdata->restart_active_links = active_links; for (link_id = 0; link_id < ARRAY_SIZE(sdata->vif.link_conf); link_id++) { if (!ieee80211_vif_link_active(&sdata->vif, link_id)) continue; 
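/*
 * Context note (illustrative, not from this file): besides resume, this
 * path also runs after a driver detects e.g. a firmware crash and calls
 *
 *	ieee80211_restart_hw(hw);
 *
 * which sets local->in_reconfig, schedules local->restart_work and
 * eventually re-enters ieee80211_reconfig(), replaying interfaces,
 * channel contexts, BSS state, stations and keys into the driver in the
 * order seen here.
 */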
link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; ieee80211_assign_chanctx(local, sdata, link); } switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MONITOR: break; case NL80211_IFTYPE_ADHOC: if (sdata->vif.cfg.ibss_joined) WARN_ON(drv_join_ibss(local, sdata)); fallthrough; default: ieee80211_reconfig_stations(sdata); fallthrough; case NL80211_IFTYPE_AP: /* AP stations are handled later */ for (i = 0; i < IEEE80211_NUM_ACS; i++) drv_conf_tx(local, &sdata->deflink, i, &sdata->deflink.tx_conf[i]); break; } if (sdata->vif.bss_conf.mu_mimo_owner) changed |= BSS_CHANGED_MU_GROUPS; if (!ieee80211_vif_is_mld(&sdata->vif)) changed |= BSS_CHANGED_IDLE; switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: if (!ieee80211_vif_is_mld(&sdata->vif)) { changed |= BSS_CHANGED_ASSOC | BSS_CHANGED_ARP_FILTER | BSS_CHANGED_PS; /* Re-send beacon info report to the driver */ if (sdata->deflink.u.mgd.have_beacon) changed |= BSS_CHANGED_BEACON_INFO; if (sdata->vif.bss_conf.max_idle_period || sdata->vif.bss_conf.protected_keep_alive) changed |= BSS_CHANGED_KEEP_ALIVE; ieee80211_bss_info_change_notify(sdata, changed); } else if (!WARN_ON(!link)) { ieee80211_link_info_change_notify(sdata, link, changed); changed = BSS_CHANGED_ASSOC | BSS_CHANGED_IDLE | BSS_CHANGED_PS | BSS_CHANGED_ARP_FILTER; ieee80211_vif_cfg_change_notify(sdata, changed); } break; case NL80211_IFTYPE_OCB: changed |= BSS_CHANGED_OCB; ieee80211_bss_info_change_notify(sdata, changed); break; case NL80211_IFTYPE_ADHOC: changed |= BSS_CHANGED_IBSS; fallthrough; case NL80211_IFTYPE_AP: changed |= BSS_CHANGED_P2P_PS; if (ieee80211_vif_is_mld(&sdata->vif)) ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_SSID); else changed |= BSS_CHANGED_SSID; if (sdata->vif.bss_conf.ftm_responder == 1 && wiphy_ext_feature_isset(sdata->local->hw.wiphy, NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER)) changed |= BSS_CHANGED_FTM_RESPONDER; if (sdata->vif.type == NL80211_IFTYPE_AP) { changed |= BSS_CHANGED_AP_PROBE_RESP; if (ieee80211_vif_is_mld(&sdata->vif)) { ieee80211_reconfig_ap_links(local, sdata, changed); break; } if (rcu_access_pointer(sdata->deflink.u.ap.beacon)) drv_start_ap(local, sdata, sdata->deflink.conf); } fallthrough; case NL80211_IFTYPE_MESH_POINT: if (sdata->vif.bss_conf.enable_beacon) { changed |= BSS_CHANGED_BEACON | BSS_CHANGED_BEACON_ENABLED; ieee80211_bss_info_change_notify(sdata, changed); } break; case NL80211_IFTYPE_NAN: res = ieee80211_reconfig_nan(sdata); if (res < 0) { ieee80211_handle_reconfig_failure(local); return res; } break; case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_P2P_DEVICE: /* nothing to do */ break; case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_WDS: WARN_ON(1); break; } } ieee80211_recalc_ps(local); /* * The sta might be in psm against the ap (e.g. because * this was the state before a hw restart), so we * explicitly send a null packet in order to make sure * it'll sync against the ap (and get out of psm). 
*/ if (!(local->hw.conf.flags & IEEE80211_CONF_PS)) { list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type != NL80211_IFTYPE_STATION) continue; if (!sdata->u.mgd.associated) continue; ieee80211_send_nullfunc(local, sdata, false); } } /* APs are now beaconing, add back stations */ list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_AP: ieee80211_reconfig_stations(sdata); break; default: break; } } /* add back keys */ list_for_each_entry(sdata, &local->interfaces, list) ieee80211_reenable_keys(sdata); /* re-enable multi-link for client interfaces */ list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->restart_active_links) ieee80211_set_active_links(&sdata->vif, sdata->restart_active_links); /* * If a link switch was scheduled before the restart, and ran * before reconfig, it will do nothing, so re-schedule. */ if (sdata->desired_active_links) wiphy_work_queue(sdata->local->hw.wiphy, &sdata->activate_links_work); } /* Reconfigure sched scan if it was interrupted by FW restart */ sched_scan_sdata = rcu_dereference_protected(local->sched_scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); sched_scan_req = rcu_dereference_protected(local->sched_scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); if (sched_scan_sdata && sched_scan_req) /* * Sched scan stopped, but we don't want to report it. Instead, * we're trying to reschedule. However, if more than one scan * plan was set, we cannot reschedule since we don't know which * scan plan was currently running (and some scan plans may have * already finished). */ if (sched_scan_req->n_scan_plans > 1 || __ieee80211_request_sched_scan_start(sched_scan_sdata, sched_scan_req)) { RCU_INIT_POINTER(local->sched_scan_sdata, NULL); RCU_INIT_POINTER(local->sched_scan_req, NULL); sched_scan_stopped = true; } if (sched_scan_stopped) cfg80211_sched_scan_stopped_locked(local->hw.wiphy, 0); wake_up: if (local->monitors == local->open_count && local->monitors > 0) ieee80211_add_virtual_monitor(local); /* * Clear the WLAN_STA_BLOCK_BA flag so new aggregation * sessions can be established after a resume. * * Also tear down aggregation sessions since reconfiguring * them in a hardware restart scenario is not easily done * right now, and the hardware will have lost information * about the sessions, but we and the AP still think they * are active. This is really a workaround though. */ if (ieee80211_hw_check(hw, AMPDU_AGGREGATION)) { list_for_each_entry(sta, &local->sta_list, list) { if (!local->resuming) ieee80211_sta_tear_down_BA_sessions( sta, AGG_STOP_LOCAL_REQUEST); clear_sta_flag(sta, WLAN_STA_BLOCK_BA); } } /* * If this is for hw restart things are still running. * We may want to change that later, however. 
*/ if (local->open_count && (!suspended || reconfig_due_to_wowlan)) drv_reconfig_complete(local, IEEE80211_RECONFIG_TYPE_RESTART); if (local->in_reconfig) { in_reconfig = local->in_reconfig; local->in_reconfig = false; barrier(); ieee80211_reconfig_roc(local); /* Requeue all works */ list_for_each_entry(sdata, &local->interfaces, list) wiphy_work_queue(local->hw.wiphy, &sdata->work); } ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_SUSPEND, false); if (in_reconfig) { list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type == NL80211_IFTYPE_STATION) ieee80211_sta_restart(sdata); } } if (!suspended) return 0; #ifdef CONFIG_PM /* first set suspended false, then resuming */ local->suspended = false; mb(); local->resuming = false; ieee80211_flush_completed_scan(local, false); if (local->open_count && !reconfig_due_to_wowlan) drv_reconfig_complete(local, IEEE80211_RECONFIG_TYPE_SUSPEND); list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type == NL80211_IFTYPE_STATION) ieee80211_sta_restart(sdata); } mod_timer(&local->sta_cleanup, jiffies + 1); #else WARN_ON(1); #endif return 0; } static void ieee80211_reconfig_disconnect(struct ieee80211_vif *vif, u8 flag) { struct ieee80211_sub_if_data *sdata; struct ieee80211_local *local; struct ieee80211_key *key; if (WARN_ON(!vif)) return; sdata = vif_to_sdata(vif); local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(flag & IEEE80211_SDATA_DISCONNECT_RESUME && !local->resuming)) return; if (WARN_ON(flag & IEEE80211_SDATA_DISCONNECT_HW_RESTART && !local->in_reconfig)) return; if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) return; sdata->flags |= flag; list_for_each_entry(key, &sdata->key_list, list) key->flags |= KEY_FLAG_TAINTED; } void ieee80211_hw_restart_disconnect(struct ieee80211_vif *vif) { ieee80211_reconfig_disconnect(vif, IEEE80211_SDATA_DISCONNECT_HW_RESTART); } EXPORT_SYMBOL_GPL(ieee80211_hw_restart_disconnect); void ieee80211_resume_disconnect(struct ieee80211_vif *vif) { ieee80211_reconfig_disconnect(vif, IEEE80211_SDATA_DISCONNECT_RESUME); } EXPORT_SYMBOL_GPL(ieee80211_resume_disconnect); void ieee80211_recalc_smps(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link) { struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_chanctx *chanctx; lockdep_assert_wiphy(local->hw.wiphy); chanctx_conf = rcu_dereference_protected(link->conf->chanctx_conf, lockdep_is_held(&local->hw.wiphy->mtx)); /* * This function can be called from a work, thus it may be possible * that the chanctx_conf is removed (due to a disconnection, for * example). * So nothing should be done in such case. 
*/ if (!chanctx_conf) return; chanctx = container_of(chanctx_conf, struct ieee80211_chanctx, conf); ieee80211_recalc_smps_chanctx(local, chanctx); } void ieee80211_recalc_min_chandef(struct ieee80211_sub_if_data *sdata, int link_id) { struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_chanctx *chanctx; int i; lockdep_assert_wiphy(local->hw.wiphy); for (i = 0; i < ARRAY_SIZE(sdata->vif.link_conf); i++) { struct ieee80211_bss_conf *bss_conf; if (link_id >= 0 && link_id != i) continue; rcu_read_lock(); bss_conf = rcu_dereference(sdata->vif.link_conf[i]); if (!bss_conf) { rcu_read_unlock(); continue; } chanctx_conf = rcu_dereference_protected(bss_conf->chanctx_conf, lockdep_is_held(&local->hw.wiphy->mtx)); /* * Since we hold the wiphy mutex (checked above) * we can take the chanctx_conf pointer out of the * RCU critical section, it cannot go away without * the mutex. Just the way we reached it could - in * theory - go away, but we don't really care and * it really shouldn't happen anyway. */ rcu_read_unlock(); if (!chanctx_conf) return; chanctx = container_of(chanctx_conf, struct ieee80211_chanctx, conf); ieee80211_recalc_chanctx_min_def(local, chanctx, NULL, false); } } size_t ieee80211_ie_split_vendor(const u8 *ies, size_t ielen, size_t offset) { size_t pos = offset; while (pos < ielen && ies[pos] != WLAN_EID_VENDOR_SPECIFIC) pos += 2 + ies[pos + 1]; return pos; } u8 *ieee80211_ie_build_ht_cap(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, u16 cap) { __le16 tmp; *pos++ = WLAN_EID_HT_CAPABILITY; *pos++ = sizeof(struct ieee80211_ht_cap); memset(pos, 0, sizeof(struct ieee80211_ht_cap)); /* capability flags */ tmp = cpu_to_le16(cap); memcpy(pos, &tmp, sizeof(u16)); pos += sizeof(u16); /* AMPDU parameters */ *pos++ = ht_cap->ampdu_factor | (ht_cap->ampdu_density << IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT); /* MCS set */ memcpy(pos, &ht_cap->mcs, sizeof(ht_cap->mcs)); pos += sizeof(ht_cap->mcs); /* extended capabilities */ pos += sizeof(__le16); /* BF capabilities */ pos += sizeof(__le32); /* antenna selection */ pos += sizeof(u8); return pos; } u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, u32 cap) { __le32 tmp; *pos++ = WLAN_EID_VHT_CAPABILITY; *pos++ = sizeof(struct ieee80211_vht_cap); memset(pos, 0, sizeof(struct ieee80211_vht_cap)); /* capability flags */ tmp = cpu_to_le32(cap); memcpy(pos, &tmp, sizeof(u32)); pos += sizeof(u32); /* VHT MCS set */ memcpy(pos, &vht_cap->vht_mcs, sizeof(vht_cap->vht_mcs)); pos += sizeof(vht_cap->vht_mcs); return pos; } /* this may return more than ieee80211_put_he_6ghz_cap() will need */ u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata) { const struct ieee80211_sta_he_cap *he_cap; struct ieee80211_supported_band *sband; u8 n; sband = ieee80211_get_sband(sdata); if (!sband) return 0; he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); if (!he_cap) return 0; n = ieee80211_he_mcs_nss_size(&he_cap->he_cap_elem); return 2 + 1 + sizeof(he_cap->he_cap_elem) + n + ieee80211_he_ppe_size(he_cap->ppe_thres[0], he_cap->he_cap_elem.phy_cap_info); } static void ieee80211_get_adjusted_he_cap(const struct ieee80211_conn_settings *conn, const struct ieee80211_sta_he_cap *he_cap, struct ieee80211_he_cap_elem *elem) { u8 ru_limit, max_ru; *elem = he_cap->he_cap_elem; switch (conn->bw_limit) { case IEEE80211_CONN_BW_LIMIT_20: ru_limit = IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_242; break; case IEEE80211_CONN_BW_LIMIT_40: ru_limit = IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_484; break; 
case IEEE80211_CONN_BW_LIMIT_80: ru_limit = IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_996; break; default: ru_limit = IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_2x996; break; } max_ru = elem->phy_cap_info[8] & IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_MASK; max_ru = min(max_ru, ru_limit); elem->phy_cap_info[8] &= ~IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_MASK; elem->phy_cap_info[8] |= max_ru; if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_40) { elem->phy_cap_info[0] &= ~(IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G); elem->phy_cap_info[9] &= ~IEEE80211_HE_PHY_CAP9_LONGER_THAN_16_SIGB_OFDM_SYM; } if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_160) { elem->phy_cap_info[0] &= ~(IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G); elem->phy_cap_info[5] &= ~IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_MASK; elem->phy_cap_info[7] &= ~(IEEE80211_HE_PHY_CAP7_STBC_TX_ABOVE_80MHZ | IEEE80211_HE_PHY_CAP7_STBC_RX_ABOVE_80MHZ); } } int ieee80211_put_he_cap(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, const struct ieee80211_supported_band *sband, const struct ieee80211_conn_settings *conn) { const struct ieee80211_sta_he_cap *he_cap; struct ieee80211_he_cap_elem elem; u8 *len; u8 n; u8 ie_len; if (!conn) conn = &ieee80211_conn_settings_unlimited; he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); if (!he_cap) return 0; /* modify on stack first to calculate 'n' and 'ie_len' correctly */ ieee80211_get_adjusted_he_cap(conn, he_cap, &elem); n = ieee80211_he_mcs_nss_size(&elem); ie_len = 2 + 1 + sizeof(he_cap->he_cap_elem) + n + ieee80211_he_ppe_size(he_cap->ppe_thres[0], he_cap->he_cap_elem.phy_cap_info); if (skb_tailroom(skb) < ie_len) return -ENOBUFS; skb_put_u8(skb, WLAN_EID_EXTENSION); len = skb_put(skb, 1); /* We'll set the size later below */ skb_put_u8(skb, WLAN_EID_EXT_HE_CAPABILITY); /* Fixed data */ skb_put_data(skb, &elem, sizeof(elem)); skb_put_data(skb, &he_cap->he_mcs_nss_supp, n); /* Check if PPE Threshold should be present */ if ((he_cap->he_cap_elem.phy_cap_info[6] & IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) == 0) goto end; /* * Calculate how many PPET16/PPET8 pairs are to come. Algorithm: * (NSS_M1 + 1) x (num of 1 bits in RU_INDEX_BITMASK) */ n = hweight8(he_cap->ppe_thres[0] & IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK); n *= (1 + ((he_cap->ppe_thres[0] & IEEE80211_PPE_THRES_NSS_MASK) >> IEEE80211_PPE_THRES_NSS_POS)); /* * Each pair is 6 bits, and we need to add the 7 "header" bits to the * total size. 
*/ n = (n * IEEE80211_PPE_THRES_INFO_PPET_SIZE * 2) + 7; n = DIV_ROUND_UP(n, 8); /* Copy PPE Thresholds */ skb_put_data(skb, &he_cap->ppe_thres, n); end: *len = skb_tail_pointer(skb) - len - 1; return 0; } int ieee80211_put_he_6ghz_cap(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, enum ieee80211_smps_mode smps_mode) { struct ieee80211_supported_band *sband; const struct ieee80211_sband_iftype_data *iftd; enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); __le16 cap; if (!cfg80211_any_usable_channels(sdata->local->hw.wiphy, BIT(NL80211_BAND_6GHZ), IEEE80211_CHAN_NO_HE)) return 0; sband = sdata->local->hw.wiphy->bands[NL80211_BAND_6GHZ]; iftd = ieee80211_get_sband_iftype_data(sband, iftype); if (!iftd) return 0; /* Check for device HE 6 GHz capability before adding element */ if (!iftd->he_6ghz_capa.capa) return 0; cap = iftd->he_6ghz_capa.capa; cap &= cpu_to_le16(~IEEE80211_HE_6GHZ_CAP_SM_PS); switch (smps_mode) { case IEEE80211_SMPS_AUTOMATIC: case IEEE80211_SMPS_NUM_MODES: WARN_ON(1); fallthrough; case IEEE80211_SMPS_OFF: cap |= le16_encode_bits(WLAN_HT_CAP_SM_PS_DISABLED, IEEE80211_HE_6GHZ_CAP_SM_PS); break; case IEEE80211_SMPS_STATIC: cap |= le16_encode_bits(WLAN_HT_CAP_SM_PS_STATIC, IEEE80211_HE_6GHZ_CAP_SM_PS); break; case IEEE80211_SMPS_DYNAMIC: cap |= le16_encode_bits(WLAN_HT_CAP_SM_PS_DYNAMIC, IEEE80211_HE_6GHZ_CAP_SM_PS); break; } if (skb_tailroom(skb) < 2 + 1 + sizeof(cap)) return -ENOBUFS; skb_put_u8(skb, WLAN_EID_EXTENSION); skb_put_u8(skb, 1 + sizeof(cap)); skb_put_u8(skb, WLAN_EID_EXT_HE_6GHZ_CAPA); skb_put_data(skb, &cap, sizeof(cap)); return 0; } u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, const struct cfg80211_chan_def *chandef, u16 prot_mode, bool rifs_mode) { struct ieee80211_ht_operation *ht_oper; /* Build HT Information */ *pos++ = WLAN_EID_HT_OPERATION; *pos++ = sizeof(struct ieee80211_ht_operation); ht_oper = (struct ieee80211_ht_operation *)pos; ht_oper->primary_chan = ieee80211_frequency_to_channel( chandef->chan->center_freq); switch (chandef->width) { case NL80211_CHAN_WIDTH_160: case NL80211_CHAN_WIDTH_80P80: case NL80211_CHAN_WIDTH_80: case NL80211_CHAN_WIDTH_40: if (chandef->center_freq1 > chandef->chan->center_freq) ht_oper->ht_param = IEEE80211_HT_PARAM_CHA_SEC_ABOVE; else ht_oper->ht_param = IEEE80211_HT_PARAM_CHA_SEC_BELOW; break; case NL80211_CHAN_WIDTH_320: /* HT information element should not be included on 6GHz */ WARN_ON(1); return pos; default: ht_oper->ht_param = IEEE80211_HT_PARAM_CHA_SEC_NONE; break; } if (ht_cap->cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 && chandef->width != NL80211_CHAN_WIDTH_20_NOHT && chandef->width != NL80211_CHAN_WIDTH_20) ht_oper->ht_param |= IEEE80211_HT_PARAM_CHAN_WIDTH_ANY; if (rifs_mode) ht_oper->ht_param |= IEEE80211_HT_PARAM_RIFS_MODE; ht_oper->operation_mode = cpu_to_le16(prot_mode); ht_oper->stbc_param = 0x0000; /* It seems that Basic MCS set and Supported MCS set are identical for the first 10 bytes */ memset(&ht_oper->basic_set, 0, 16); memcpy(&ht_oper->basic_set, &ht_cap->mcs, 10); return pos + sizeof(struct ieee80211_ht_operation); } void ieee80211_ie_build_wide_bw_cs(u8 *pos, const struct cfg80211_chan_def *chandef) { *pos++ = WLAN_EID_WIDE_BW_CHANNEL_SWITCH; /* EID */ *pos++ = 3; /* IE length */ /* New channel width */ switch (chandef->width) { case NL80211_CHAN_WIDTH_80: *pos++ = IEEE80211_VHT_CHANWIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_160: *pos++ = IEEE80211_VHT_CHANWIDTH_160MHZ; break; case NL80211_CHAN_WIDTH_80P80: *pos++ = 
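/*
 * Worked example of the PPE threshold sizing in ieee80211_put_he_cap()
 * above (illustrative): with NSS_M1 = 1 (two spatial streams) and two
 * bits set in RU_INDEX_BITMASK there are 2 * 2 = 4 PPET16/PPET8 pairs,
 * so with IEEE80211_PPE_THRES_INFO_PPET_SIZE = 3:
 *
 *	bits  = 4 * 3 * 2 + 7 = 31	(7 "header" bits included)
 *	bytes = DIV_ROUND_UP(31, 8) = 4
 *
 * and 4 bytes of he_cap->ppe_thres[] are appended to the element.
 */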
IEEE80211_VHT_CHANWIDTH_80P80MHZ; break; case NL80211_CHAN_WIDTH_320: /* The behavior is not defined for 320 MHz channels */ WARN_ON(1); fallthrough; default: *pos++ = IEEE80211_VHT_CHANWIDTH_USE_HT; } /* new center frequency segment 0 */ *pos++ = ieee80211_frequency_to_channel(chandef->center_freq1); /* new center frequency segment 1 */ if (chandef->center_freq2) *pos++ = ieee80211_frequency_to_channel(chandef->center_freq2); else *pos++ = 0; } u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, const struct cfg80211_chan_def *chandef) { struct ieee80211_vht_operation *vht_oper; *pos++ = WLAN_EID_VHT_OPERATION; *pos++ = sizeof(struct ieee80211_vht_operation); vht_oper = (struct ieee80211_vht_operation *)pos; vht_oper->center_freq_seg0_idx = ieee80211_frequency_to_channel( chandef->center_freq1); if (chandef->center_freq2) vht_oper->center_freq_seg1_idx = ieee80211_frequency_to_channel(chandef->center_freq2); else vht_oper->center_freq_seg1_idx = 0x00; switch (chandef->width) { case NL80211_CHAN_WIDTH_160: /* * Convert 160 MHz channel width to new style as interop * workaround. */ vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; vht_oper->center_freq_seg1_idx = vht_oper->center_freq_seg0_idx; if (chandef->chan->center_freq < chandef->center_freq1) vht_oper->center_freq_seg0_idx -= 8; else vht_oper->center_freq_seg0_idx += 8; break; case NL80211_CHAN_WIDTH_80P80: /* * Convert 80+80 MHz channel width to new style as interop * workaround. */ vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_80: vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_320: /* VHT information element should not be included on 6GHz */ WARN_ON(1); return pos; default: vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_USE_HT; break; } /* don't require special VHT peer rates */ vht_oper->basic_mcs_set = cpu_to_le16(0xffff); return pos + sizeof(struct ieee80211_vht_operation); } u8 *ieee80211_ie_build_he_oper(u8 *pos, const struct cfg80211_chan_def *chandef) { struct ieee80211_he_operation *he_oper; struct ieee80211_he_6ghz_oper *he_6ghz_op; u32 he_oper_params; u8 ie_len = 1 + sizeof(struct ieee80211_he_operation); if (chandef->chan->band == NL80211_BAND_6GHZ) ie_len += sizeof(struct ieee80211_he_6ghz_oper); *pos++ = WLAN_EID_EXTENSION; *pos++ = ie_len; *pos++ = WLAN_EID_EXT_HE_OPERATION; he_oper_params = 0; he_oper_params |= u32_encode_bits(1023, /* disabled */ IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK); he_oper_params |= u32_encode_bits(1, IEEE80211_HE_OPERATION_ER_SU_DISABLE); he_oper_params |= u32_encode_bits(1, IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED); if (chandef->chan->band == NL80211_BAND_6GHZ) he_oper_params |= u32_encode_bits(1, IEEE80211_HE_OPERATION_6GHZ_OP_INFO); he_oper = (struct ieee80211_he_operation *)pos; he_oper->he_oper_params = cpu_to_le32(he_oper_params); /* don't require special HE peer rates */ he_oper->he_mcs_nss_set = cpu_to_le16(0xffff); pos += sizeof(struct ieee80211_he_operation); if (chandef->chan->band != NL80211_BAND_6GHZ) goto out; /* TODO add VHT operational */ he_6ghz_op = (struct ieee80211_he_6ghz_oper *)pos; he_6ghz_op->minrate = 6; /* 6 Mbps */ he_6ghz_op->primary = ieee80211_frequency_to_channel(chandef->chan->center_freq); he_6ghz_op->ccfs0 = ieee80211_frequency_to_channel(chandef->center_freq1); if (chandef->center_freq2) he_6ghz_op->ccfs1 = ieee80211_frequency_to_channel(chandef->center_freq2); else he_6ghz_op->ccfs1 = 0; switch (chandef->width) { case NL80211_CHAN_WIDTH_320: /* * 
TODO: mesh operation is not defined over 6GHz 320 MHz * channels. */ WARN_ON(1); break; case NL80211_CHAN_WIDTH_160: /* Convert 160 MHz channel width to new style as interop * workaround. */ he_6ghz_op->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ; he_6ghz_op->ccfs1 = he_6ghz_op->ccfs0; if (chandef->chan->center_freq < chandef->center_freq1) he_6ghz_op->ccfs0 -= 8; else he_6ghz_op->ccfs0 += 8; fallthrough; case NL80211_CHAN_WIDTH_80P80: he_6ghz_op->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ; break; case NL80211_CHAN_WIDTH_80: he_6ghz_op->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_40: he_6ghz_op->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ; break; default: he_6ghz_op->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ; break; } pos += sizeof(struct ieee80211_he_6ghz_oper); out: return pos; } u8 *ieee80211_ie_build_eht_oper(u8 *pos, const struct cfg80211_chan_def *chandef, const struct ieee80211_sta_eht_cap *eht_cap) { const struct ieee80211_eht_mcs_nss_supp_20mhz_only *eht_mcs_nss = &eht_cap->eht_mcs_nss_supp.only_20mhz; struct ieee80211_eht_operation *eht_oper; struct ieee80211_eht_operation_info *eht_oper_info; u8 eht_oper_len = offsetof(struct ieee80211_eht_operation, optional); u8 eht_oper_info_len = offsetof(struct ieee80211_eht_operation_info, optional); u8 chan_width = 0; *pos++ = WLAN_EID_EXTENSION; *pos++ = 1 + eht_oper_len + eht_oper_info_len; *pos++ = WLAN_EID_EXT_EHT_OPERATION; eht_oper = (struct ieee80211_eht_operation *)pos; memcpy(&eht_oper->basic_mcs_nss, eht_mcs_nss, sizeof(*eht_mcs_nss)); eht_oper->params |= IEEE80211_EHT_OPER_INFO_PRESENT; pos += eht_oper_len; eht_oper_info = (struct ieee80211_eht_operation_info *)eht_oper->optional; eht_oper_info->ccfs0 = ieee80211_frequency_to_channel(chandef->center_freq1); if (chandef->center_freq2) eht_oper_info->ccfs1 = ieee80211_frequency_to_channel(chandef->center_freq2); else eht_oper_info->ccfs1 = 0; switch (chandef->width) { case NL80211_CHAN_WIDTH_320: chan_width = IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ; eht_oper_info->ccfs1 = eht_oper_info->ccfs0; if (chandef->chan->center_freq < chandef->center_freq1) eht_oper_info->ccfs0 -= 16; else eht_oper_info->ccfs0 += 16; break; case NL80211_CHAN_WIDTH_160: eht_oper_info->ccfs1 = eht_oper_info->ccfs0; if (chandef->chan->center_freq < chandef->center_freq1) eht_oper_info->ccfs0 -= 8; else eht_oper_info->ccfs0 += 8; fallthrough; case NL80211_CHAN_WIDTH_80P80: chan_width = IEEE80211_EHT_OPER_CHAN_WIDTH_160MHZ; break; case NL80211_CHAN_WIDTH_80: chan_width = IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_40: chan_width = IEEE80211_EHT_OPER_CHAN_WIDTH_40MHZ; break; default: chan_width = IEEE80211_EHT_OPER_CHAN_WIDTH_20MHZ; break; } eht_oper_info->control = chan_width; pos += eht_oper_info_len; /* TODO: eht_oper_info->optional */ return pos; } bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, struct cfg80211_chan_def *chandef) { enum nl80211_channel_type channel_type; if (!ht_oper) return false; switch (ht_oper->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) { case IEEE80211_HT_PARAM_CHA_SEC_NONE: channel_type = NL80211_CHAN_HT20; break; case IEEE80211_HT_PARAM_CHA_SEC_ABOVE: channel_type = NL80211_CHAN_HT40PLUS; break; case IEEE80211_HT_PARAM_CHA_SEC_BELOW: channel_type = NL80211_CHAN_HT40MINUS; break; default: return false; } cfg80211_chandef_create(chandef, chandef->chan, channel_type); return true; } bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 
vht_cap_info, const struct ieee80211_vht_operation *oper, const struct ieee80211_ht_operation *htop, struct cfg80211_chan_def *chandef) { struct cfg80211_chan_def new = *chandef; int cf0, cf1; int ccfs0, ccfs1, ccfs2; int ccf0, ccf1; u32 vht_cap; bool support_80_80 = false; bool support_160 = false; u8 ext_nss_bw_supp = u32_get_bits(vht_cap_info, IEEE80211_VHT_CAP_EXT_NSS_BW_MASK); u8 supp_chwidth = u32_get_bits(vht_cap_info, IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK); if (!oper || !htop) return false; vht_cap = hw->wiphy->bands[chandef->chan->band]->vht_cap.cap; support_160 = (vht_cap & (IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK | IEEE80211_VHT_CAP_EXT_NSS_BW_MASK)); support_80_80 = ((vht_cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) || (vht_cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ && vht_cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) || ((vht_cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) >> IEEE80211_VHT_CAP_EXT_NSS_BW_SHIFT > 1)); ccfs0 = oper->center_freq_seg0_idx; ccfs1 = oper->center_freq_seg1_idx; ccfs2 = (le16_to_cpu(htop->operation_mode) & IEEE80211_HT_OP_MODE_CCFS2_MASK) >> IEEE80211_HT_OP_MODE_CCFS2_SHIFT; ccf0 = ccfs0; /* if not supported, parse as though we didn't understand it */ if (!ieee80211_hw_check(hw, SUPPORTS_VHT_EXT_NSS_BW)) ext_nss_bw_supp = 0; /* * Cf. IEEE 802.11 Table 9-250 * * We really just consider that because it's inefficient to connect * at a higher bandwidth than we'll actually be able to use. */ switch ((supp_chwidth << 4) | ext_nss_bw_supp) { default: case 0x00: ccf1 = 0; support_160 = false; support_80_80 = false; break; case 0x01: support_80_80 = false; fallthrough; case 0x02: case 0x03: ccf1 = ccfs2; break; case 0x10: ccf1 = ccfs1; break; case 0x11: case 0x12: if (!ccfs1) ccf1 = ccfs2; else ccf1 = ccfs1; break; case 0x13: case 0x20: case 0x23: ccf1 = ccfs1; break; } cf0 = ieee80211_channel_to_frequency(ccf0, chandef->chan->band); cf1 = ieee80211_channel_to_frequency(ccf1, chandef->chan->band); switch (oper->chan_width) { case IEEE80211_VHT_CHANWIDTH_USE_HT: /* just use HT information directly */ break; case IEEE80211_VHT_CHANWIDTH_80MHZ: new.width = NL80211_CHAN_WIDTH_80; new.center_freq1 = cf0; /* If needed, adjust based on the newer interop workaround. 
*/ if (ccf1) { unsigned int diff; diff = abs(ccf1 - ccf0); if ((diff == 8) && support_160) { new.width = NL80211_CHAN_WIDTH_160; new.center_freq1 = cf1; } else if ((diff > 8) && support_80_80) { new.width = NL80211_CHAN_WIDTH_80P80; new.center_freq2 = cf1; } } break; case IEEE80211_VHT_CHANWIDTH_160MHZ: /* deprecated encoding */ new.width = NL80211_CHAN_WIDTH_160; new.center_freq1 = cf0; break; case IEEE80211_VHT_CHANWIDTH_80P80MHZ: /* deprecated encoding */ new.width = NL80211_CHAN_WIDTH_80P80; new.center_freq1 = cf0; new.center_freq2 = cf1; break; default: return false; } if (!cfg80211_chandef_valid(&new)) return false; *chandef = new; return true; } void ieee80211_chandef_eht_oper(const struct ieee80211_eht_operation_info *info, struct cfg80211_chan_def *chandef) { chandef->center_freq1 = ieee80211_channel_to_frequency(info->ccfs0, chandef->chan->band); switch (u8_get_bits(info->control, IEEE80211_EHT_OPER_CHAN_WIDTH)) { case IEEE80211_EHT_OPER_CHAN_WIDTH_20MHZ: chandef->width = NL80211_CHAN_WIDTH_20; break; case IEEE80211_EHT_OPER_CHAN_WIDTH_40MHZ: chandef->width = NL80211_CHAN_WIDTH_40; break; case IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ: chandef->width = NL80211_CHAN_WIDTH_80; break; case IEEE80211_EHT_OPER_CHAN_WIDTH_160MHZ: chandef->width = NL80211_CHAN_WIDTH_160; chandef->center_freq1 = ieee80211_channel_to_frequency(info->ccfs1, chandef->chan->band); break; case IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ: chandef->width = NL80211_CHAN_WIDTH_320; chandef->center_freq1 = ieee80211_channel_to_frequency(info->ccfs1, chandef->chan->band); break; } } bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_local *local, const struct ieee80211_he_operation *he_oper, const struct ieee80211_eht_operation *eht_oper, struct cfg80211_chan_def *chandef) { struct cfg80211_chan_def he_chandef = *chandef; const struct ieee80211_he_6ghz_oper *he_6ghz_oper; u32 freq; if (chandef->chan->band != NL80211_BAND_6GHZ) return true; if (!he_oper) return false; he_6ghz_oper = ieee80211_he_6ghz_oper(he_oper); if (!he_6ghz_oper) return false; /* * The EHT operation IE does not contain the primary channel so the * primary channel frequency should be taken from the 6 GHz operation * information. 
*/ freq = ieee80211_channel_to_frequency(he_6ghz_oper->primary, NL80211_BAND_6GHZ); he_chandef.chan = ieee80211_get_channel(local->hw.wiphy, freq); if (!he_chandef.chan) return false; if (!eht_oper || !(eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT)) { switch (u8_get_bits(he_6ghz_oper->control, IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH)) { case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ: he_chandef.width = NL80211_CHAN_WIDTH_20; break; case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ: he_chandef.width = NL80211_CHAN_WIDTH_40; break; case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ: he_chandef.width = NL80211_CHAN_WIDTH_80; break; case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ: he_chandef.width = NL80211_CHAN_WIDTH_80; if (!he_6ghz_oper->ccfs1) break; if (abs(he_6ghz_oper->ccfs1 - he_6ghz_oper->ccfs0) == 8) he_chandef.width = NL80211_CHAN_WIDTH_160; else he_chandef.width = NL80211_CHAN_WIDTH_80P80; break; } if (he_chandef.width == NL80211_CHAN_WIDTH_160) { he_chandef.center_freq1 = ieee80211_channel_to_frequency(he_6ghz_oper->ccfs1, NL80211_BAND_6GHZ); } else { he_chandef.center_freq1 = ieee80211_channel_to_frequency(he_6ghz_oper->ccfs0, NL80211_BAND_6GHZ); he_chandef.center_freq2 = ieee80211_channel_to_frequency(he_6ghz_oper->ccfs1, NL80211_BAND_6GHZ); } } else { ieee80211_chandef_eht_oper((const void *)eht_oper->optional, &he_chandef); he_chandef.punctured = ieee80211_eht_oper_dis_subchan_bitmap(eht_oper); } if (!cfg80211_chandef_valid(&he_chandef)) return false; *chandef = he_chandef; return true; } bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper, struct cfg80211_chan_def *chandef) { u32 oper_freq; if (!oper) return false; switch (FIELD_GET(S1G_OPER_CH_WIDTH_OPER, oper->ch_width)) { case IEEE80211_S1G_CHANWIDTH_1MHZ: chandef->width = NL80211_CHAN_WIDTH_1; break; case IEEE80211_S1G_CHANWIDTH_2MHZ: chandef->width = NL80211_CHAN_WIDTH_2; break; case IEEE80211_S1G_CHANWIDTH_4MHZ: chandef->width = NL80211_CHAN_WIDTH_4; break; case IEEE80211_S1G_CHANWIDTH_8MHZ: chandef->width = NL80211_CHAN_WIDTH_8; break; case IEEE80211_S1G_CHANWIDTH_16MHZ: chandef->width = NL80211_CHAN_WIDTH_16; break; default: return false; } oper_freq = ieee80211_channel_to_freq_khz(oper->oper_ch, NL80211_BAND_S1GHZ); chandef->center_freq1 = KHZ_TO_MHZ(oper_freq); chandef->freq1_offset = oper_freq % 1000; return true; } int ieee80211_put_srates_elem(struct sk_buff *skb, const struct ieee80211_supported_band *sband, u32 basic_rates, u32 rate_flags, u32 masked_rates, u8 element_id) { u8 i, rates, skip; rates = 0; for (i = 0; i < sband->n_bitrates; i++) { if ((rate_flags & sband->bitrates[i].flags) != rate_flags) continue; if (masked_rates & BIT(i)) continue; rates++; } if (element_id == WLAN_EID_SUPP_RATES) { rates = min_t(u8, rates, 8); skip = 0; } else { skip = 8; if (rates <= skip) return 0; rates -= skip; } if (skb_tailroom(skb) < rates + 2) return -ENOBUFS; skb_put_u8(skb, element_id); skb_put_u8(skb, rates); for (i = 0; i < sband->n_bitrates && rates; i++) { int rate; u8 basic; if ((rate_flags & sband->bitrates[i].flags) != rate_flags) continue; if (masked_rates & BIT(i)) continue; if (skip > 0) { skip--; continue; } basic = basic_rates & BIT(i) ? 
0x80 : 0; rate = DIV_ROUND_UP(sband->bitrates[i].bitrate, 5); skb_put_u8(skb, basic | (u8)rate); rates--; } WARN(rates > 0, "rates confused: rates:%d, element:%d\n", rates, element_id); return 0; } int ieee80211_ave_rssi(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); if (WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION)) return 0; return -ewma_beacon_signal_read(&sdata->deflink.u.mgd.ave_beacon_signal); } EXPORT_SYMBOL_GPL(ieee80211_ave_rssi); u8 ieee80211_mcs_to_chains(const struct ieee80211_mcs_info *mcs) { if (!mcs) return 1; /* TODO: consider rx_highest */ if (mcs->rx_mask[3]) return 4; if (mcs->rx_mask[2]) return 3; if (mcs->rx_mask[1]) return 2; return 1; } /** * ieee80211_calculate_rx_timestamp - calculate timestamp in frame * @local: mac80211 hw info struct * @status: RX status * @mpdu_len: total MPDU length (including FCS) * @mpdu_offset: offset into MPDU to calculate timestamp at * * This function calculates the RX timestamp at the given MPDU offset, taking * into account what the RX timestamp was. An offset of 0 will just normalize * the timestamp to TSF at beginning of MPDU reception. * * Returns: the calculated timestamp */ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, struct ieee80211_rx_status *status, unsigned int mpdu_len, unsigned int mpdu_offset) { u64 ts = status->mactime; bool mactime_plcp_start; struct rate_info ri; u16 rate; u8 n_ltf; if (WARN_ON(!ieee80211_have_rx_timestamp(status))) return 0; mactime_plcp_start = (status->flag & RX_FLAG_MACTIME) == RX_FLAG_MACTIME_PLCP_START; memset(&ri, 0, sizeof(ri)); ri.bw = status->bw; /* Fill cfg80211 rate info */ switch (status->encoding) { case RX_ENC_EHT: ri.flags |= RATE_INFO_FLAGS_EHT_MCS; ri.mcs = status->rate_idx; ri.nss = status->nss; ri.eht_ru_alloc = status->eht.ru; if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) ri.flags |= RATE_INFO_FLAGS_SHORT_GI; /* TODO/FIXME: is this right? handle other PPDUs */ if (mactime_plcp_start) { mpdu_offset += 2; ts += 36; } break; case RX_ENC_HE: ri.flags |= RATE_INFO_FLAGS_HE_MCS; ri.mcs = status->rate_idx; ri.nss = status->nss; ri.he_ru_alloc = status->he_ru; if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) ri.flags |= RATE_INFO_FLAGS_SHORT_GI; /* * See P802.11ax_D6.0, section 27.3.4 for * VHT PPDU format. */ if (mactime_plcp_start) { mpdu_offset += 2; ts += 36; /* * TODO: * For HE MU PPDU, add the HE-SIG-B. * For HE ER PPDU, add 8us for the HE-SIG-A. * For HE TB PPDU, add 4us for the HE-STF. * Add the HE-LTF durations - variable. */ } break; case RX_ENC_HT: ri.mcs = status->rate_idx; ri.flags |= RATE_INFO_FLAGS_MCS; if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) ri.flags |= RATE_INFO_FLAGS_SHORT_GI; /* * See P802.11REVmd_D3.0, section 19.3.2 for * HT PPDU format. */ if (mactime_plcp_start) { mpdu_offset += 2; if (status->enc_flags & RX_ENC_FLAG_HT_GF) ts += 24; else ts += 32; /* * Add Data HT-LTFs per streams * TODO: add Extension HT-LTFs, 4us per LTF */ n_ltf = ((ri.mcs >> 3) & 3) + 1; n_ltf = n_ltf == 3 ? 4 : n_ltf; ts += n_ltf * 4; } break; case RX_ENC_VHT: ri.flags |= RATE_INFO_FLAGS_VHT_MCS; ri.mcs = status->rate_idx; ri.nss = status->nss; if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) ri.flags |= RATE_INFO_FLAGS_SHORT_GI; /* * See P802.11REVmd_D3.0, section 21.3.2 for * VHT PPDU format. */ if (mactime_plcp_start) { mpdu_offset += 2; ts += 36; /* * Add VHT-LTFs per streams */ n_ltf = (ri.nss != 1) && (ri.nss % 2) ? 
ri.nss + 1 : ri.nss; ts += 4 * n_ltf; } break; default: WARN_ON(1); fallthrough; case RX_ENC_LEGACY: { struct ieee80211_supported_band *sband; sband = local->hw.wiphy->bands[status->band]; ri.legacy = sband->bitrates[status->rate_idx].bitrate; if (mactime_plcp_start) { if (status->band == NL80211_BAND_5GHZ) { ts += 20; mpdu_offset += 2; } else if (status->enc_flags & RX_ENC_FLAG_SHORTPRE) { ts += 96; } else { ts += 192; } } break; } } rate = cfg80211_calculate_bitrate(&ri); if (WARN_ONCE(!rate, "Invalid bitrate: flags=0x%llx, idx=%d, vht_nss=%d\n", (unsigned long long)status->flag, status->rate_idx, status->nss)) return 0; /* rewind from end of MPDU */ if ((status->flag & RX_FLAG_MACTIME) == RX_FLAG_MACTIME_END) ts -= mpdu_len * 8 * 10 / rate; ts += mpdu_offset * 8 * 10 / rate; return ts; } /* Cancel CAC for the interfaces under the specified @local. If @ctx is * also provided, only the interfaces using that ctx will be canceled. */ void ieee80211_dfs_cac_cancel(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { struct ieee80211_sub_if_data *sdata; struct cfg80211_chan_def chandef; struct ieee80211_link_data *link; struct ieee80211_chanctx_conf *chanctx_conf; unsigned int link_id; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry(sdata, &local->interfaces, list) { for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; chanctx_conf = sdata_dereference(link->conf->chanctx_conf, sdata); if (ctx && &ctx->conf != chanctx_conf) continue; wiphy_delayed_work_cancel(local->hw.wiphy, &link->dfs_cac_timer_work); if (!sdata->wdev.links[link_id].cac_started) continue; chandef = link->conf->chanreq.oper; ieee80211_link_release_channel(link); cfg80211_cac_event(sdata->dev, &chandef, NL80211_RADAR_CAC_ABORTED, GFP_KERNEL, link_id); } } } void ieee80211_dfs_radar_detected_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, radar_detected_work); struct cfg80211_chan_def chandef; struct ieee80211_chanctx *ctx; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry(ctx, &local->chanctx_list, list) { if (ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER) continue; if (!ctx->radar_detected) continue; ctx->radar_detected = false; chandef = ctx->conf.def; ieee80211_dfs_cac_cancel(local, ctx); cfg80211_radar_event(local->hw.wiphy, &chandef, GFP_KERNEL); } } static void ieee80211_radar_mark_chan_ctx_iterator(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *chanctx_conf, void *data) { struct ieee80211_chanctx *ctx = container_of(chanctx_conf, struct ieee80211_chanctx, conf); if (ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER) return; if (data && data != chanctx_conf) return; ctx->radar_detected = true; } void ieee80211_radar_detected(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *chanctx_conf) { struct ieee80211_local *local = hw_to_local(hw); trace_api_radar_detected(local); ieee80211_iter_chan_contexts_atomic(hw, ieee80211_radar_mark_chan_ctx_iterator, chanctx_conf); wiphy_work_queue(hw->wiphy, &local->radar_detected_work); } EXPORT_SYMBOL(ieee80211_radar_detected); void ieee80211_chandef_downgrade(struct cfg80211_chan_def *c, struct ieee80211_conn_settings *conn) { enum nl80211_chan_width new_primary_width; struct ieee80211_conn_settings _ignored = {}; /* allow passing NULL if caller doesn't care */ if (!conn) conn = &_ignored; again: /* no-HT indicates nothing to do */ new_primary_width = 
NL80211_CHAN_WIDTH_20_NOHT; switch (c->width) { default: case NL80211_CHAN_WIDTH_20_NOHT: WARN_ON_ONCE(1); fallthrough; case NL80211_CHAN_WIDTH_20: c->width = NL80211_CHAN_WIDTH_20_NOHT; conn->mode = IEEE80211_CONN_MODE_LEGACY; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; c->punctured = 0; break; case NL80211_CHAN_WIDTH_40: c->width = NL80211_CHAN_WIDTH_20; c->center_freq1 = c->chan->center_freq; if (conn->mode == IEEE80211_CONN_MODE_VHT) conn->mode = IEEE80211_CONN_MODE_HT; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; c->punctured = 0; break; case NL80211_CHAN_WIDTH_80: new_primary_width = NL80211_CHAN_WIDTH_40; if (conn->mode == IEEE80211_CONN_MODE_VHT) conn->mode = IEEE80211_CONN_MODE_HT; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_40; break; case NL80211_CHAN_WIDTH_80P80: c->center_freq2 = 0; c->width = NL80211_CHAN_WIDTH_80; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_80; break; case NL80211_CHAN_WIDTH_160: new_primary_width = NL80211_CHAN_WIDTH_80; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_80; break; case NL80211_CHAN_WIDTH_320: new_primary_width = NL80211_CHAN_WIDTH_160; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_160; break; case NL80211_CHAN_WIDTH_1: case NL80211_CHAN_WIDTH_2: case NL80211_CHAN_WIDTH_4: case NL80211_CHAN_WIDTH_8: case NL80211_CHAN_WIDTH_16: WARN_ON_ONCE(1); /* keep c->width */ conn->mode = IEEE80211_CONN_MODE_S1G; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; break; case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: WARN_ON_ONCE(1); /* keep c->width */ conn->mode = IEEE80211_CONN_MODE_LEGACY; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; break; } if (new_primary_width != NL80211_CHAN_WIDTH_20_NOHT) { c->center_freq1 = cfg80211_chandef_primary(c, new_primary_width, &c->punctured); c->width = new_primary_width; } /* * With an 80 MHz channel, we might have the puncturing in the primary * 40 Mhz channel, but that's not valid when downgraded to 40 MHz width. * In that case, downgrade again. */ if (!cfg80211_chandef_valid(c) && c->punctured) goto again; WARN_ON_ONCE(!cfg80211_chandef_valid(c)); } /* * Returns true if smps_mode_new is strictly more restrictive than * smps_mode_old. 
*/ bool ieee80211_smps_is_restrictive(enum ieee80211_smps_mode smps_mode_old, enum ieee80211_smps_mode smps_mode_new) { if (WARN_ON_ONCE(smps_mode_old == IEEE80211_SMPS_AUTOMATIC || smps_mode_new == IEEE80211_SMPS_AUTOMATIC)) return false; switch (smps_mode_old) { case IEEE80211_SMPS_STATIC: return false; case IEEE80211_SMPS_DYNAMIC: return smps_mode_new == IEEE80211_SMPS_STATIC; case IEEE80211_SMPS_OFF: return smps_mode_new != IEEE80211_SMPS_OFF; default: WARN_ON(1); } return false; } int ieee80211_send_action_csa(struct ieee80211_sub_if_data *sdata, struct cfg80211_csa_settings *csa_settings) { struct sk_buff *skb; struct ieee80211_mgmt *mgmt; struct ieee80211_local *local = sdata->local; int freq; int hdr_len = offsetofend(struct ieee80211_mgmt, u.action.u.chan_switch); u8 *pos; if (sdata->vif.type != NL80211_IFTYPE_ADHOC && sdata->vif.type != NL80211_IFTYPE_MESH_POINT) return -EOPNOTSUPP; skb = dev_alloc_skb(local->tx_headroom + hdr_len + 5 + /* channel switch announcement element */ 3 + /* secondary channel offset element */ 5 + /* wide bandwidth channel switch announcement */ 8); /* mesh channel switch parameters element */ if (!skb) return -ENOMEM; skb_reserve(skb, local->tx_headroom); mgmt = skb_put_zero(skb, hdr_len); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); eth_broadcast_addr(mgmt->da); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); if (ieee80211_vif_is_mesh(&sdata->vif)) { memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); } else { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; memcpy(mgmt->bssid, ifibss->bssid, ETH_ALEN); } mgmt->u.action.category = WLAN_CATEGORY_SPECTRUM_MGMT; mgmt->u.action.u.chan_switch.action_code = WLAN_ACTION_SPCT_CHL_SWITCH; pos = skb_put(skb, 5); *pos++ = WLAN_EID_CHANNEL_SWITCH; /* EID */ *pos++ = 3; /* IE length */ *pos++ = csa_settings->block_tx ? 1 : 0; /* CSA mode */ freq = csa_settings->chandef.chan->center_freq; *pos++ = ieee80211_frequency_to_channel(freq); /* channel */ *pos++ = csa_settings->count; /* count */ if (csa_settings->chandef.width == NL80211_CHAN_WIDTH_40) { enum nl80211_channel_type ch_type; skb_put(skb, 3); *pos++ = WLAN_EID_SECONDARY_CHANNEL_OFFSET; /* EID */ *pos++ = 1; /* IE length */ ch_type = cfg80211_get_chandef_type(&csa_settings->chandef); if (ch_type == NL80211_CHAN_HT40PLUS) *pos++ = IEEE80211_HT_PARAM_CHA_SEC_ABOVE; else *pos++ = IEEE80211_HT_PARAM_CHA_SEC_BELOW; } if (ieee80211_vif_is_mesh(&sdata->vif)) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; skb_put(skb, 8); *pos++ = WLAN_EID_CHAN_SWITCH_PARAM; /* EID */ *pos++ = 6; /* IE length */ *pos++ = sdata->u.mesh.mshcfg.dot11MeshTTL; /* Mesh TTL */ *pos = 0x00; /* Mesh Flag: Tx Restrict, Initiator, Reason */ *pos |= WLAN_EID_CHAN_SWITCH_PARAM_INITIATOR; *pos++ |= csa_settings->block_tx ? 
WLAN_EID_CHAN_SWITCH_PARAM_TX_RESTRICT : 0x00; put_unaligned_le16(WLAN_REASON_MESH_CHAN, pos); /* Reason Cd */ pos += 2; put_unaligned_le16(ifmsh->pre_value, pos);/* Precedence Value */ pos += 2; } if (csa_settings->chandef.width == NL80211_CHAN_WIDTH_80 || csa_settings->chandef.width == NL80211_CHAN_WIDTH_80P80 || csa_settings->chandef.width == NL80211_CHAN_WIDTH_160) { skb_put(skb, 5); ieee80211_ie_build_wide_bw_cs(pos, &csa_settings->chandef); } ieee80211_tx_skb(sdata, skb); return 0; } static bool ieee80211_extend_noa_desc(struct ieee80211_noa_data *data, u32 tsf, int i) { s32 end = data->desc[i].start + data->desc[i].duration - (tsf + 1); int skip; if (end > 0) return false; /* One shot NOA */ if (data->count[i] == 1) return false; if (data->desc[i].interval == 0) return false; /* End time is in the past, check for repetitions */ skip = DIV_ROUND_UP(-end, data->desc[i].interval); if (data->count[i] < 255) { if (data->count[i] <= skip) { data->count[i] = 0; return false; } data->count[i] -= skip; } data->desc[i].start += skip * data->desc[i].interval; return true; } static bool ieee80211_extend_absent_time(struct ieee80211_noa_data *data, u32 tsf, s32 *offset) { bool ret = false; int i; for (i = 0; i < IEEE80211_P2P_NOA_DESC_MAX; i++) { s32 cur; if (!data->count[i]) continue; if (ieee80211_extend_noa_desc(data, tsf + *offset, i)) ret = true; cur = data->desc[i].start - tsf; if (cur > *offset) continue; cur = data->desc[i].start + data->desc[i].duration - tsf; if (cur > *offset) *offset = cur; } return ret; } static u32 ieee80211_get_noa_absent_time(struct ieee80211_noa_data *data, u32 tsf) { s32 offset = 0; int tries = 0; /* * arbitrary limit, used to avoid infinite loops when combined NoA * descriptors cover the full time period. */ int max_tries = 5; ieee80211_extend_absent_time(data, tsf, &offset); do { if (!ieee80211_extend_absent_time(data, tsf, &offset)) break; tries++; } while (tries < max_tries); return offset; } void ieee80211_update_p2p_noa(struct ieee80211_noa_data *data, u32 tsf) { u32 next_offset = BIT(31) - 1; int i; data->absent = 0; data->has_next_tsf = false; for (i = 0; i < IEEE80211_P2P_NOA_DESC_MAX; i++) { s32 start; if (!data->count[i]) continue; ieee80211_extend_noa_desc(data, tsf, i); start = data->desc[i].start - tsf; if (start <= 0) data->absent |= BIT(i); if (next_offset > start) next_offset = start; data->has_next_tsf = true; } if (data->absent) next_offset = ieee80211_get_noa_absent_time(data, tsf); data->next_tsf = tsf + next_offset; } EXPORT_SYMBOL(ieee80211_update_p2p_noa); int ieee80211_parse_p2p_noa(const struct ieee80211_p2p_noa_attr *attr, struct ieee80211_noa_data *data, u32 tsf) { int ret = 0; int i; memset(data, 0, sizeof(*data)); for (i = 0; i < IEEE80211_P2P_NOA_DESC_MAX; i++) { const struct ieee80211_p2p_noa_desc *desc = &attr->desc[i]; if (!desc->count || !desc->duration) continue; data->count[i] = desc->count; data->desc[i].start = le32_to_cpu(desc->start_time); data->desc[i].duration = le32_to_cpu(desc->duration); data->desc[i].interval = le32_to_cpu(desc->interval); if (data->count[i] > 1 && data->desc[i].interval < data->desc[i].duration) continue; ieee80211_extend_noa_desc(data, tsf, i); ret++; } if (ret) ieee80211_update_p2p_noa(data, tsf); return ret; } EXPORT_SYMBOL(ieee80211_parse_p2p_noa); void ieee80211_recalc_dtim(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { u64 tsf = drv_get_tsf(local, sdata); u64 dtim_count = 0; u16 beacon_int = sdata->vif.bss_conf.beacon_int * 1024; u8 dtim_period = 
sdata->vif.bss_conf.dtim_period; struct ps_data *ps; u8 bcns_from_dtim; if (tsf == -1ULL || !beacon_int || !dtim_period) return; if (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { if (!sdata->bss) return; ps = &sdata->bss->ps; } else if (ieee80211_vif_is_mesh(&sdata->vif)) { ps = &sdata->u.mesh.ps; } else { return; } /* * actually finds last dtim_count, mac80211 will update in * __beacon_add_tim(). * dtim_count = dtim_period - (tsf / bcn_int) % dtim_period */ do_div(tsf, beacon_int); bcns_from_dtim = do_div(tsf, dtim_period); /* just had a DTIM */ if (!bcns_from_dtim) dtim_count = 0; else dtim_count = dtim_period - bcns_from_dtim; ps->dtim_count = dtim_count; } static u8 ieee80211_chanctx_radar_detect(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { struct ieee80211_link_data *link; u8 radar_detect = 0; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED)) return 0; list_for_each_entry(link, &ctx->reserved_links, reserved_chanctx_list) if (link->reserved_radar_required) radar_detect |= BIT(link->reserved.oper.width); /* * An in-place reservation context should not have any assigned vifs * until it replaces the other context. */ WARN_ON(ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER && !list_empty(&ctx->assigned_links)); list_for_each_entry(link, &ctx->assigned_links, assigned_chanctx_list) { if (!link->radar_required) continue; radar_detect |= BIT(link->conf->chanreq.oper.width); } return radar_detect; } static u32 __ieee80211_get_radio_mask(struct ieee80211_sub_if_data *sdata) { struct ieee80211_bss_conf *link_conf; struct ieee80211_chanctx_conf *conf; unsigned int link_id; u32 mask = 0; for_each_vif_active_link(&sdata->vif, link_conf, link_id) { conf = sdata_dereference(link_conf->chanctx_conf, sdata); if (!conf || conf->radio_idx < 0) continue; mask |= BIT(conf->radio_idx); } return mask; } u32 ieee80211_get_radio_mask(struct wiphy *wiphy, struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); return __ieee80211_get_radio_mask(sdata); } static bool ieee80211_sdata_uses_radio(struct ieee80211_sub_if_data *sdata, int radio_idx) { if (radio_idx < 0) return true; return __ieee80211_get_radio_mask(sdata) & BIT(radio_idx); } static int ieee80211_fill_ifcomb_params(struct ieee80211_local *local, struct iface_combination_params *params, const struct cfg80211_chan_def *chandef, struct ieee80211_sub_if_data *sdata) { struct ieee80211_sub_if_data *sdata_iter; struct ieee80211_chanctx *ctx; int total = !!sdata; list_for_each_entry(ctx, &local->chanctx_list, list) { if (ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED) continue; if (params->radio_idx >= 0 && ctx->conf.radio_idx != params->radio_idx) continue; params->radar_detect |= ieee80211_chanctx_radar_detect(local, ctx); if (chandef && ctx->mode != IEEE80211_CHANCTX_EXCLUSIVE && cfg80211_chandef_compatible(chandef, &ctx->conf.def)) continue; params->num_different_channels++; } list_for_each_entry(sdata_iter, &local->interfaces, list) { struct wireless_dev *wdev_iter; wdev_iter = &sdata_iter->wdev; if (sdata_iter == sdata || !ieee80211_sdata_running(sdata_iter) || cfg80211_iftype_allowed(local->hw.wiphy, wdev_iter->iftype, 0, 1)) continue; if (!ieee80211_sdata_uses_radio(sdata_iter, params->radio_idx)) continue; params->iftype_num[wdev_iter->iftype]++; total++; } return total; } int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata, const struct cfg80211_chan_def 
*chandef, enum ieee80211_chanctx_mode chanmode, u8 radar_detect, int radio_idx) { bool shared = chanmode == IEEE80211_CHANCTX_SHARED; struct ieee80211_local *local = sdata->local; enum nl80211_iftype iftype = sdata->wdev.iftype; struct iface_combination_params params = { .radar_detect = radar_detect, .radio_idx = radio_idx, }; int total; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(hweight32(radar_detect) > 1)) return -EINVAL; if (WARN_ON(chandef && chanmode == IEEE80211_CHANCTX_SHARED && !chandef->chan)) return -EINVAL; if (WARN_ON(iftype >= NUM_NL80211_IFTYPES)) return -EINVAL; if (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_MESH_POINT) { /* * always passing this is harmless, since it'll be the * same value that cfg80211 finds if it finds the same * interface ... and that's always allowed */ params.new_beacon_int = sdata->vif.bss_conf.beacon_int; } /* Always allow software iftypes */ if (cfg80211_iftype_allowed(local->hw.wiphy, iftype, 0, 1)) { if (radar_detect) return -EINVAL; return 0; } if (chandef) params.num_different_channels = 1; if (iftype != NL80211_IFTYPE_UNSPECIFIED) params.iftype_num[iftype] = 1; total = ieee80211_fill_ifcomb_params(local, &params, shared ? chandef : NULL, sdata); if (total == 1 && !params.radar_detect) return 0; return cfg80211_check_combinations(local->hw.wiphy, &params); } static void ieee80211_iter_max_chans(const struct ieee80211_iface_combination *c, void *data) { u32 *max_num_different_channels = data; *max_num_different_channels = max(*max_num_different_channels, c->num_different_channels); } int ieee80211_max_num_channels(struct ieee80211_local *local, int radio_idx) { u32 max_num_different_channels = 1; int err; struct iface_combination_params params = { .radio_idx = radio_idx, }; lockdep_assert_wiphy(local->hw.wiphy); ieee80211_fill_ifcomb_params(local, &params, NULL, NULL); err = cfg80211_iter_combinations(local->hw.wiphy, &params, ieee80211_iter_max_chans, &max_num_different_channels); if (err < 0) return err; return max_num_different_channels; } void ieee80211_add_s1g_capab_ie(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_s1g_cap *caps, struct sk_buff *skb) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_s1g_cap s1g_capab; u8 *pos; int i; if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) return; if (!caps->s1g) return; memcpy(s1g_capab.capab_info, caps->cap, sizeof(caps->cap)); memcpy(s1g_capab.supp_mcs_nss, caps->nss_mcs, sizeof(caps->nss_mcs)); /* override the capability info */ for (i = 0; i < sizeof(ifmgd->s1g_capa.capab_info); i++) { u8 mask = ifmgd->s1g_capa_mask.capab_info[i]; s1g_capab.capab_info[i] &= ~mask; s1g_capab.capab_info[i] |= ifmgd->s1g_capa.capab_info[i] & mask; } /* then MCS and NSS set */ for (i = 0; i < sizeof(ifmgd->s1g_capa.supp_mcs_nss); i++) { u8 mask = ifmgd->s1g_capa_mask.supp_mcs_nss[i]; s1g_capab.supp_mcs_nss[i] &= ~mask; s1g_capab.supp_mcs_nss[i] |= ifmgd->s1g_capa.supp_mcs_nss[i] & mask; } pos = skb_put(skb, 2 + sizeof(s1g_capab)); *pos++ = WLAN_EID_S1G_CAPABILITIES; *pos++ = sizeof(s1g_capab); memcpy(pos, &s1g_capab, sizeof(s1g_capab)); } void ieee80211_add_aid_request_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { u8 *pos = skb_put(skb, 3); *pos++ = WLAN_EID_AID_REQUEST; *pos++ = 1; *pos++ = 0; } u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo) { *buf++ = WLAN_EID_VENDOR_SPECIFIC; *buf++ = 7; /* len */ *buf++ = 0x00; /* Microsoft OUI 00:50:F2 */ *buf++ = 0x50; *buf++ = 0xf2; *buf++ = 2; /* WME */ *buf++ = 0; /* WME 
info */ *buf++ = 1; /* WME ver */ *buf++ = qosinfo; /* U-APSD no in use */ return buf; } void ieee80211_txq_get_depth(struct ieee80211_txq *txq, unsigned long *frame_cnt, unsigned long *byte_cnt) { struct txq_info *txqi = to_txq_info(txq); u32 frag_cnt = 0, frag_bytes = 0; struct sk_buff *skb; skb_queue_walk(&txqi->frags, skb) { frag_cnt++; frag_bytes += skb->len; } if (frame_cnt) *frame_cnt = txqi->tin.backlog_packets + frag_cnt; if (byte_cnt) *byte_cnt = txqi->tin.backlog_bytes + frag_bytes; } EXPORT_SYMBOL(ieee80211_txq_get_depth); const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS] = { IEEE80211_WMM_IE_STA_QOSINFO_AC_VO, IEEE80211_WMM_IE_STA_QOSINFO_AC_VI, IEEE80211_WMM_IE_STA_QOSINFO_AC_BE, IEEE80211_WMM_IE_STA_QOSINFO_AC_BK }; u16 ieee80211_encode_usf(int listen_interval) { static const int listen_int_usf[] = { 1, 10, 1000, 10000 }; u16 ui, usf = 0; /* find greatest USF */ while (usf < IEEE80211_MAX_USF) { if (listen_interval % listen_int_usf[usf + 1]) break; usf += 1; } ui = listen_interval / listen_int_usf[usf]; /* error if there is a remainder. Should've been checked by user */ WARN_ON_ONCE(ui > IEEE80211_MAX_UI); listen_interval = FIELD_PREP(LISTEN_INT_USF, usf) | FIELD_PREP(LISTEN_INT_UI, ui); return (u16) listen_interval; } /* this may return more than ieee80211_put_eht_cap() will need */ u8 ieee80211_ie_len_eht_cap(struct ieee80211_sub_if_data *sdata) { const struct ieee80211_sta_he_cap *he_cap; const struct ieee80211_sta_eht_cap *eht_cap; struct ieee80211_supported_band *sband; bool is_ap; u8 n; sband = ieee80211_get_sband(sdata); if (!sband) return 0; he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); eht_cap = ieee80211_get_eht_iftype_cap_vif(sband, &sdata->vif); if (!he_cap || !eht_cap) return 0; is_ap = sdata->vif.type == NL80211_IFTYPE_AP; n = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, &eht_cap->eht_cap_elem, is_ap); return 2 + 1 + sizeof(eht_cap->eht_cap_elem) + n + ieee80211_eht_ppe_size(eht_cap->eht_ppe_thres[0], eht_cap->eht_cap_elem.phy_cap_info); return 0; } int ieee80211_put_eht_cap(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, const struct ieee80211_supported_band *sband, const struct ieee80211_conn_settings *conn) { const struct ieee80211_sta_he_cap *he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); const struct ieee80211_sta_eht_cap *eht_cap = ieee80211_get_eht_iftype_cap_vif(sband, &sdata->vif); bool for_ap = sdata->vif.type == NL80211_IFTYPE_AP; struct ieee80211_eht_cap_elem_fixed fixed; struct ieee80211_he_cap_elem he; u8 mcs_nss_len, ppet_len; u8 orig_mcs_nss_len; u8 ie_len; if (!conn) conn = &ieee80211_conn_settings_unlimited; /* Make sure we have place for the IE */ if (!he_cap || !eht_cap) return 0; orig_mcs_nss_len = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, &eht_cap->eht_cap_elem, for_ap); ieee80211_get_adjusted_he_cap(conn, he_cap, &he); fixed = eht_cap->eht_cap_elem; if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_80) fixed.phy_cap_info[6] &= ~IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_80MHZ; if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_160) { fixed.phy_cap_info[1] &= ~IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK; fixed.phy_cap_info[2] &= ~IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK; fixed.phy_cap_info[6] &= ~IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_160MHZ; } if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_320) { fixed.phy_cap_info[0] &= ~IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ; fixed.phy_cap_info[1] &= ~IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_320MHZ_MASK; fixed.phy_cap_info[2] &= 
~IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_320MHZ_MASK; fixed.phy_cap_info[6] &= ~IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_320MHZ; } if (conn->bw_limit == IEEE80211_CONN_BW_LIMIT_20) fixed.phy_cap_info[0] &= ~IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ; mcs_nss_len = ieee80211_eht_mcs_nss_size(&he, &fixed, for_ap); ppet_len = ieee80211_eht_ppe_size(eht_cap->eht_ppe_thres[0], fixed.phy_cap_info); ie_len = 2 + 1 + sizeof(eht_cap->eht_cap_elem) + mcs_nss_len + ppet_len; if (skb_tailroom(skb) < ie_len) return -ENOBUFS; skb_put_u8(skb, WLAN_EID_EXTENSION); skb_put_u8(skb, ie_len - 2); skb_put_u8(skb, WLAN_EID_EXT_EHT_CAPABILITY); skb_put_data(skb, &fixed, sizeof(fixed)); if (mcs_nss_len == 4 && orig_mcs_nss_len != 4) { /* * If the (non-AP) STA became 20 MHz only, then convert from * <=80 to 20-MHz-only format, where MCSes are indicated in * the groups 0-7, 8-9, 10-11, 12-13 rather than just 0-9, * 10-11, 12-13. Thus, use 0-9 for 0-7 and 8-9. */ skb_put_u8(skb, eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs9_max_nss); skb_put_u8(skb, eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs9_max_nss); skb_put_u8(skb, eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs11_max_nss); skb_put_u8(skb, eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs13_max_nss); } else { skb_put_data(skb, &eht_cap->eht_mcs_nss_supp, mcs_nss_len); } if (ppet_len) skb_put_data(skb, &eht_cap->eht_ppe_thres, ppet_len); return 0; } const char *ieee80211_conn_mode_str(enum ieee80211_conn_mode mode) { static const char * const modes[] = { [IEEE80211_CONN_MODE_S1G] = "S1G", [IEEE80211_CONN_MODE_LEGACY] = "legacy", [IEEE80211_CONN_MODE_HT] = "HT", [IEEE80211_CONN_MODE_VHT] = "VHT", [IEEE80211_CONN_MODE_HE] = "HE", [IEEE80211_CONN_MODE_EHT] = "EHT", }; if (WARN_ON(mode >= ARRAY_SIZE(modes))) return "<out of range>"; return modes[mode] ?: "<missing string>"; } enum ieee80211_conn_bw_limit ieee80211_min_bw_limit_from_chandef(struct cfg80211_chan_def *chandef) { switch (chandef->width) { case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: return IEEE80211_CONN_BW_LIMIT_20; case NL80211_CHAN_WIDTH_40: return IEEE80211_CONN_BW_LIMIT_40; case NL80211_CHAN_WIDTH_80: return IEEE80211_CONN_BW_LIMIT_80; case NL80211_CHAN_WIDTH_80P80: case NL80211_CHAN_WIDTH_160: return IEEE80211_CONN_BW_LIMIT_160; case NL80211_CHAN_WIDTH_320: return IEEE80211_CONN_BW_LIMIT_320; default: WARN(1, "unhandled chandef width %d\n", chandef->width); return IEEE80211_CONN_BW_LIMIT_20; } } void ieee80211_clear_tpe(struct ieee80211_parsed_tpe *tpe) { for (int i = 0; i < 2; i++) { tpe->max_local[i].valid = false; memset(tpe->max_local[i].power, IEEE80211_TPE_MAX_TX_PWR_NO_CONSTRAINT, sizeof(tpe->max_local[i].power)); tpe->max_reg_client[i].valid = false; memset(tpe->max_reg_client[i].power, IEEE80211_TPE_MAX_TX_PWR_NO_CONSTRAINT, sizeof(tpe->max_reg_client[i].power)); tpe->psd_local[i].valid = false; memset(tpe->psd_local[i].power, IEEE80211_TPE_PSD_NO_LIMIT, sizeof(tpe->psd_local[i].power)); tpe->psd_reg_client[i].valid = false; memset(tpe->psd_reg_client[i].power, IEEE80211_TPE_PSD_NO_LIMIT, sizeof(tpe->psd_reg_client[i].power)); } }
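As a worked example of the listen-interval encoding performed by ieee80211_encode_usf() above: the interval is split into a Unified Scaling Factor (USF) selecting a multiplier from {1, 10, 1000, 10000} and an Unscaled Interval (UI) such that interval = UI * multiplier. The standalone sketch below is not part of the kernel sources, and the UI/USF bit positions (UI in bits 0-13, USF in bits 14-15) are an assumption based on the S1G Listen Interval field layout.

#include <stdio.h>
#include <stdint.h>

static uint16_t encode_usf(unsigned int listen_interval)
{
	static const unsigned int usf_factor[] = { 1, 10, 1000, 10000 };
	unsigned int usf = 0, ui;

	/* pick the largest factor that still divides the interval evenly */
	while (usf < 3 && !(listen_interval % usf_factor[usf + 1]))
		usf++;

	/* caller must ensure ui fits in the 14-bit UI field */
	ui = listen_interval / usf_factor[usf];

	return (uint16_t)((usf << 14) | (ui & 0x3fff));
}

int main(void)
{
	/* 3000 -> USF 2 (x1000), UI 3: encoded as (2 << 14) | 3 = 0x8003 */
	printf("0x%04x\n", encode_usf(3000));
	return 0;
}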
// SPDX-License-Identifier: GPL-2.0
/*
 * shstk.c - Intel shadow stack support
 *
 * Copyright (c) 2021, Intel Corporation.
 * Yu-cheng Yu <yu-cheng.yu@intel.com>
 */

#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/sched/signal.h>
#include <linux/compat.h>
#include <linux/sizes.h>
#include <linux/user.h>
#include <linux/syscalls.h>
#include <asm/msr.h>
#include <asm/fpu/xstate.h>
#include <asm/fpu/types.h>
#include <asm/shstk.h>
#include <asm/special_insns.h>
#include <asm/fpu/api.h>
#include <asm/prctl.h>

#define SS_FRAME_SIZE 8

static bool features_enabled(unsigned long features)
{
	return current->thread.features & features;
}

static void features_set(unsigned long features)
{
	current->thread.features |= features;
}

static void features_clr(unsigned long features)
{
	current->thread.features &= ~features;
}

/*
 * Create a restore token on the shadow stack. A token is always 8-byte
 * and aligned to 8.
*/ static int create_rstor_token(unsigned long ssp, unsigned long *token_addr) { unsigned long addr; /* Token must be aligned */ if (!IS_ALIGNED(ssp, 8)) return -EINVAL; addr = ssp - SS_FRAME_SIZE; /* * SSP is aligned, so reserved bits and mode bit are a zero, just mark * the token 64-bit. */ ssp |= BIT(0); if (write_user_shstk_64((u64 __user *)addr, (u64)ssp)) return -EFAULT; if (token_addr) *token_addr = addr; return 0; } /* * VM_SHADOW_STACK will have a guard page. This helps userspace protect * itself from attacks. The reasoning is as follows: * * The shadow stack pointer(SSP) is moved by CALL, RET, and INCSSPQ. The * INCSSP instruction can increment the shadow stack pointer. It is the * shadow stack analog of an instruction like: * * addq $0x80, %rsp * * However, there is one important difference between an ADD on %rsp * and INCSSP. In addition to modifying SSP, INCSSP also reads from the * memory of the first and last elements that were "popped". It can be * thought of as acting like this: * * READ_ONCE(ssp); // read+discard top element on stack * ssp += nr_to_pop * 8; // move the shadow stack * READ_ONCE(ssp-8); // read+discard last popped stack element * * The maximum distance INCSSP can move the SSP is 2040 bytes, before * it would read the memory. Therefore a single page gap will be enough * to prevent any operation from shifting the SSP to an adjacent stack, * since it would have to land in the gap at least once, causing a * fault. */ static unsigned long alloc_shstk(unsigned long addr, unsigned long size, unsigned long token_offset, bool set_res_tok) { int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_ABOVE4G; struct mm_struct *mm = current->mm; unsigned long mapped_addr, unused; if (addr) flags |= MAP_FIXED_NOREPLACE; mmap_write_lock(mm); mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags, VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL); mmap_write_unlock(mm); if (!set_res_tok || IS_ERR_VALUE(mapped_addr)) goto out; if (create_rstor_token(mapped_addr + token_offset, NULL)) { vm_munmap(mapped_addr, size); return -EINVAL; } out: return mapped_addr; } static unsigned long adjust_shstk_size(unsigned long size) { if (size) return PAGE_ALIGN(size); return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G)); } static void unmap_shadow_stack(u64 base, u64 size) { int r; r = vm_munmap(base, size); /* * mmap_write_lock_killable() failed with -EINTR. This means * the process is about to die and have it's MM cleaned up. * This task shouldn't ever make it back to userspace. In this * case it is ok to leak a shadow stack, so just exit out. */ if (r == -EINTR) return; /* * For all other types of vm_munmap() failure, either the * system is out of memory or there is bug. 
*/ WARN_ON_ONCE(r); } static int shstk_setup(void) { struct thread_shstk *shstk = &current->thread.shstk; unsigned long addr, size; /* Already enabled */ if (features_enabled(ARCH_SHSTK_SHSTK)) return 0; /* Also not supported for 32 bit */ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_ia32_syscall()) return -EOPNOTSUPP; size = adjust_shstk_size(0); addr = alloc_shstk(0, size, 0, false); if (IS_ERR_VALUE(addr)) return PTR_ERR((void *)addr); fpregs_lock_and_load(); wrmsrl(MSR_IA32_PL3_SSP, addr + size); wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN); fpregs_unlock(); shstk->base = addr; shstk->size = size; features_set(ARCH_SHSTK_SHSTK); return 0; } void reset_thread_features(void) { memset(&current->thread.shstk, 0, sizeof(struct thread_shstk)); current->thread.features = 0; current->thread.features_locked = 0; } unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags, unsigned long stack_size) { struct thread_shstk *shstk = &tsk->thread.shstk; unsigned long addr, size; /* * If shadow stack is not enabled on the new thread, skip any * switch to a new shadow stack. */ if (!features_enabled(ARCH_SHSTK_SHSTK)) return 0; /* * For CLONE_VFORK the child will share the parents shadow stack. * Make sure to clear the internal tracking of the thread shadow * stack so the freeing logic run for child knows to leave it alone. */ if (clone_flags & CLONE_VFORK) { shstk->base = 0; shstk->size = 0; return 0; } /* * For !CLONE_VM the child will use a copy of the parents shadow * stack. */ if (!(clone_flags & CLONE_VM)) return 0; size = adjust_shstk_size(stack_size); addr = alloc_shstk(0, size, 0, false); if (IS_ERR_VALUE(addr)) return addr; shstk->base = addr; shstk->size = size; return addr + size; } static unsigned long get_user_shstk_addr(void) { unsigned long long ssp; fpregs_lock_and_load(); rdmsrl(MSR_IA32_PL3_SSP, ssp); fpregs_unlock(); return ssp; } #define SHSTK_DATA_BIT BIT(63) static int put_shstk_data(u64 __user *addr, u64 data) { if (WARN_ON_ONCE(data & SHSTK_DATA_BIT)) return -EINVAL; /* * Mark the high bit so that the sigframe can't be processed as a * return address. */ if (write_user_shstk_64(addr, data | SHSTK_DATA_BIT)) return -EFAULT; return 0; } static int get_shstk_data(unsigned long *data, unsigned long __user *addr) { unsigned long ldata; if (unlikely(get_user(ldata, addr))) return -EFAULT; if (!(ldata & SHSTK_DATA_BIT)) return -EINVAL; *data = ldata & ~SHSTK_DATA_BIT; return 0; } static int shstk_push_sigframe(unsigned long *ssp) { unsigned long target_ssp = *ssp; /* Token must be aligned */ if (!IS_ALIGNED(target_ssp, 8)) return -EINVAL; *ssp -= SS_FRAME_SIZE; if (put_shstk_data((void __user *)*ssp, target_ssp)) return -EFAULT; return 0; } static int shstk_pop_sigframe(unsigned long *ssp) { struct vm_area_struct *vma; unsigned long token_addr; bool need_to_check_vma; int err = 1; /* * It is possible for the SSP to be off the end of a shadow stack by 4 * or 8 bytes. If the shadow stack is at the start of a page or 4 bytes * before it, it might be this case, so check that the address being * read is actually shadow stack. 
*/ if (!IS_ALIGNED(*ssp, 8)) return -EINVAL; need_to_check_vma = PAGE_ALIGN(*ssp) == *ssp; if (need_to_check_vma) mmap_read_lock_killable(current->mm); err = get_shstk_data(&token_addr, (unsigned long __user *)*ssp); if (unlikely(err)) goto out_err; if (need_to_check_vma) { vma = find_vma(current->mm, *ssp); if (!vma || !(vma->vm_flags & VM_SHADOW_STACK)) { err = -EFAULT; goto out_err; } mmap_read_unlock(current->mm); } /* Restore SSP aligned? */ if (unlikely(!IS_ALIGNED(token_addr, 8))) return -EINVAL; /* SSP in userspace? */ if (unlikely(token_addr >= TASK_SIZE_MAX)) return -EINVAL; *ssp = token_addr; return 0; out_err: if (need_to_check_vma) mmap_read_unlock(current->mm); return err; } int setup_signal_shadow_stack(struct ksignal *ksig) { void __user *restorer = ksig->ka.sa.sa_restorer; unsigned long ssp; int err; if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || !features_enabled(ARCH_SHSTK_SHSTK)) return 0; if (!restorer) return -EINVAL; ssp = get_user_shstk_addr(); if (unlikely(!ssp)) return -EINVAL; err = shstk_push_sigframe(&ssp); if (unlikely(err)) return err; /* Push restorer address */ ssp -= SS_FRAME_SIZE; err = write_user_shstk_64((u64 __user *)ssp, (u64)restorer); if (unlikely(err)) return -EFAULT; fpregs_lock_and_load(); wrmsrl(MSR_IA32_PL3_SSP, ssp); fpregs_unlock(); return 0; } int restore_signal_shadow_stack(void) { unsigned long ssp; int err; if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || !features_enabled(ARCH_SHSTK_SHSTK)) return 0; ssp = get_user_shstk_addr(); if (unlikely(!ssp)) return -EINVAL; err = shstk_pop_sigframe(&ssp); if (unlikely(err)) return err; fpregs_lock_and_load(); wrmsrl(MSR_IA32_PL3_SSP, ssp); fpregs_unlock(); return 0; } void shstk_free(struct task_struct *tsk) { struct thread_shstk *shstk = &tsk->thread.shstk; if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || !features_enabled(ARCH_SHSTK_SHSTK)) return; /* * When fork() with CLONE_VM fails, the child (tsk) already has a * shadow stack allocated, and exit_thread() calls this function to * free it. In this case the parent (current) and the child share * the same mm struct. */ if (!tsk->mm || tsk->mm != current->mm) return; /* * If shstk->base is NULL, then this task is not managing its * own shadow stack (CLONE_VFORK). So skip freeing it. */ if (!shstk->base) return; /* * shstk->base is NULL for CLONE_VFORK child tasks, and so is * normal. But size = 0 on a shstk->base is not normal and * indicated an attempt to free the thread shadow stack twice. * Warn about it. */ if (WARN_ON(!shstk->size)) return; unmap_shadow_stack(shstk->base, shstk->size); shstk->size = 0; } static int wrss_control(bool enable) { u64 msrval; if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) return -EOPNOTSUPP; /* * Only enable WRSS if shadow stack is enabled. If shadow stack is not * enabled, WRSS will already be disabled, so don't bother clearing it * when disabling. */ if (!features_enabled(ARCH_SHSTK_SHSTK)) return -EPERM; /* Already enabled/disabled? */ if (features_enabled(ARCH_SHSTK_WRSS) == enable) return 0; fpregs_lock_and_load(); rdmsrl(MSR_IA32_U_CET, msrval); if (enable) { features_set(ARCH_SHSTK_WRSS); msrval |= CET_WRSS_EN; } else { features_clr(ARCH_SHSTK_WRSS); if (!(msrval & CET_WRSS_EN)) goto unlock; msrval &= ~CET_WRSS_EN; } wrmsrl(MSR_IA32_U_CET, msrval); unlock: fpregs_unlock(); return 0; } static int shstk_disable(void) { if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) return -EOPNOTSUPP; /* Already disabled? 
*/ if (!features_enabled(ARCH_SHSTK_SHSTK)) return 0; fpregs_lock_and_load(); /* Disable WRSS too when disabling shadow stack */ wrmsrl(MSR_IA32_U_CET, 0); wrmsrl(MSR_IA32_PL3_SSP, 0); fpregs_unlock(); shstk_free(current); features_clr(ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS); return 0; } SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags) { bool set_tok = flags & SHADOW_STACK_SET_TOKEN; unsigned long aligned_size; if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) return -EOPNOTSUPP; if (flags & ~SHADOW_STACK_SET_TOKEN) return -EINVAL; /* If there isn't space for a token */ if (set_tok && size < 8) return -ENOSPC; if (addr && addr < SZ_4G) return -ERANGE; /* * An overflow would result in attempting to write the restore token * to the wrong location. Not catastrophic, but just return the right * error code and block it. */ aligned_size = PAGE_ALIGN(size); if (aligned_size < size) return -EOVERFLOW; return alloc_shstk(addr, aligned_size, size, set_tok); } long shstk_prctl(struct task_struct *task, int option, unsigned long arg2) { unsigned long features = arg2; if (option == ARCH_SHSTK_STATUS) { return put_user(task->thread.features, (unsigned long __user *)arg2); } if (option == ARCH_SHSTK_LOCK) { task->thread.features_locked |= features; return 0; } /* Only allow via ptrace */ if (task != current) { if (option == ARCH_SHSTK_UNLOCK && IS_ENABLED(CONFIG_CHECKPOINT_RESTORE)) { task->thread.features_locked &= ~features; return 0; } return -EINVAL; } /* Do not allow to change locked features */ if (features & task->thread.features_locked) return -EPERM; /* Only support enabling/disabling one feature at a time. */ if (hweight_long(features) > 1) return -EINVAL; if (option == ARCH_SHSTK_DISABLE) { if (features & ARCH_SHSTK_WRSS) return wrss_control(false); if (features & ARCH_SHSTK_SHSTK) return shstk_disable(); return -EINVAL; } /* Handle ARCH_SHSTK_ENABLE */ if (features & ARCH_SHSTK_SHSTK) return shstk_setup(); if (features & ARCH_SHSTK_WRSS) return wrss_control(true); return -EINVAL; } int shstk_update_last_frame(unsigned long val) { unsigned long ssp; if (!features_enabled(ARCH_SHSTK_SHSTK)) return 0; ssp = get_user_shstk_addr(); return write_user_shstk_64((u64 __user *)ssp, (u64)val); } bool shstk_is_enabled(void) { return features_enabled(ARCH_SHSTK_SHSTK); }
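The arch_prctl() options handled by shstk_prctl() and the map_shadow_stack syscall above form the userspace-facing interface. A minimal userspace sketch of using them follows; it is not part of the kernel file, and the fallback constant values and syscall number are assumptions for systems whose uapi headers predate the feature.

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef ARCH_SHSTK_ENABLE			/* from <asm/prctl.h> on recent kernels */
#define ARCH_SHSTK_ENABLE	0x5001		/* assumed values */
#define ARCH_SHSTK_DISABLE	0x5002
#define ARCH_SHSTK_SHSTK	(1ULL << 0)
#endif
#ifndef SHADOW_STACK_SET_TOKEN			/* from <asm/mman.h> */
#define SHADOW_STACK_SET_TOKEN	(1ULL << 0)	/* assumed value */
#endif
#ifndef __NR_map_shadow_stack
#define __NR_map_shadow_stack	453		/* assumed syscall number */
#endif

int main(void)
{
	/* shstk_setup(): allocate a shadow stack and enable it for this thread */
	if (syscall(SYS_arch_prctl, ARCH_SHSTK_ENABLE, ARCH_SHSTK_SHSTK)) {
		perror("ARCH_SHSTK_ENABLE");
		return 1;
	}

	/*
	 * Map a second 1 MiB shadow stack with a restore token at its high
	 * end, e.g. for switching stacks later with RSTORSSP.
	 */
	long addr = syscall(__NR_map_shadow_stack, 0UL, 1UL << 20,
			    SHADOW_STACK_SET_TOKEN);
	if (addr == -1)
		perror("map_shadow_stack");
	else
		printf("extra shadow stack at %#lx\n", (unsigned long)addr);

	/*
	 * Disable again before main() returns: frames entered before the
	 * enable have no shadow-stack entries, so returning through them
	 * with shadow stack still on would raise a control-protection fault.
	 */
	syscall(SYS_arch_prctl, ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK);
	return 0;
}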
// SPDX-License-Identifier: GPL-2.0-only
/*
 * VHT handling
 *
 * Portions of this file
 * Copyright(c) 2015 - 2016 Intel Deutschland GmbH
 * Copyright (C) 2018 - 2024 Intel Corporation
 */

#include <linux/ieee80211.h>
#include <linux/export.h>
#include <net/mac80211.h>
#include "ieee80211_i.h"
#include "rate.h"

static void __check_vhtcap_disable(struct ieee80211_sub_if_data *sdata,
				   struct ieee80211_sta_vht_cap *vht_cap,
				   u32 flag)
{
	__le32 le_flag = cpu_to_le32(flag);

	if (sdata->u.mgd.vht_capa_mask.vht_cap_info & le_flag &&
!(sdata->u.mgd.vht_capa.vht_cap_info & le_flag)) vht_cap->cap &= ~flag; } void ieee80211_apply_vhtcap_overrides(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_vht_cap *vht_cap) { int i; u16 rxmcs_mask, rxmcs_cap, rxmcs_n, txmcs_mask, txmcs_cap, txmcs_n; if (!vht_cap->vht_supported) return; if (sdata->vif.type != NL80211_IFTYPE_STATION) return; __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_RXLDPC); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_SHORT_GI_80); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_SHORT_GI_160); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_TXSTBC); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN); /* Allow user to decrease AMPDU length exponent */ if (sdata->u.mgd.vht_capa_mask.vht_cap_info & cpu_to_le32(IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK)) { u32 cap, n; n = le32_to_cpu(sdata->u.mgd.vht_capa.vht_cap_info) & IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK; n >>= IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT; cap = vht_cap->cap & IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK; cap >>= IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT; if (n < cap) { vht_cap->cap &= ~IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK; vht_cap->cap |= n << IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT; } } /* Allow the user to decrease MCSes */ rxmcs_mask = le16_to_cpu(sdata->u.mgd.vht_capa_mask.supp_mcs.rx_mcs_map); rxmcs_n = le16_to_cpu(sdata->u.mgd.vht_capa.supp_mcs.rx_mcs_map); rxmcs_n &= rxmcs_mask; rxmcs_cap = le16_to_cpu(vht_cap->vht_mcs.rx_mcs_map); txmcs_mask = le16_to_cpu(sdata->u.mgd.vht_capa_mask.supp_mcs.tx_mcs_map); txmcs_n = le16_to_cpu(sdata->u.mgd.vht_capa.supp_mcs.tx_mcs_map); txmcs_n &= txmcs_mask; txmcs_cap = le16_to_cpu(vht_cap->vht_mcs.tx_mcs_map); for (i = 0; i < 8; i++) { u8 m, n, c; m = (rxmcs_mask >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; n = (rxmcs_n >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; c = (rxmcs_cap >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; if (m && ((c != IEEE80211_VHT_MCS_NOT_SUPPORTED && n < c) || n == IEEE80211_VHT_MCS_NOT_SUPPORTED)) { rxmcs_cap &= ~(3 << 2*i); rxmcs_cap |= (rxmcs_n & (3 << 2*i)); } m = (txmcs_mask >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; n = (txmcs_n >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; c = (txmcs_cap >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; if (m && ((c != IEEE80211_VHT_MCS_NOT_SUPPORTED && n < c) || n == IEEE80211_VHT_MCS_NOT_SUPPORTED)) { txmcs_cap &= ~(3 << 2*i); txmcs_cap |= (txmcs_n & (3 << 2*i)); } } vht_cap->vht_mcs.rx_mcs_map = cpu_to_le16(rxmcs_cap); vht_cap->vht_mcs.tx_mcs_map = cpu_to_le16(txmcs_cap); } void ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct ieee80211_vht_cap *vht_cap_ie, const struct ieee80211_vht_cap *vht_cap_ie2, struct link_sta_info *link_sta) { struct ieee80211_sta_vht_cap *vht_cap = &link_sta->pub->vht_cap; struct ieee80211_sta_vht_cap own_cap; u32 cap_info, i; bool have_80mhz; u32 mpdu_len; memset(vht_cap, 0, sizeof(*vht_cap)); if (!link_sta->pub->ht_cap.ht_supported) return; if (!vht_cap_ie || !sband->vht_cap.vht_supported) return; /* Allow VHT if at least one channel on the sband supports 80 MHz */ have_80mhz = false; for (i = 0; i < sband->n_channels; i++) { if 
(sband->channels[i].flags & (IEEE80211_CHAN_DISABLED | IEEE80211_CHAN_NO_80MHZ)) continue; have_80mhz = true; break; } if (!have_80mhz) return; /* * A VHT STA must support 40 MHz, but if we verify that here * then we break a few things - some APs (e.g. Netgear R6300v2 * and others based on the BCM4360 chipset) will unset this * capability bit when operating in 20 MHz. */ vht_cap->vht_supported = true; own_cap = sband->vht_cap; /* * If user has specified capability overrides, take care * of that if the station we're setting up is the AP that * we advertised a restricted capability set to. Override * our own capabilities and then use those below. */ if (sdata->vif.type == NL80211_IFTYPE_STATION && !test_sta_flag(link_sta->sta, WLAN_STA_TDLS_PEER)) ieee80211_apply_vhtcap_overrides(sdata, &own_cap); /* take some capabilities as-is */ cap_info = le32_to_cpu(vht_cap_ie->vht_cap_info); vht_cap->cap = cap_info; vht_cap->cap &= IEEE80211_VHT_CAP_RXLDPC | IEEE80211_VHT_CAP_VHT_TXOP_PS | IEEE80211_VHT_CAP_HTC_VHT | IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK | IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_UNSOL_MFB | IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_MRQ_MFB | IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN | IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN; vht_cap->cap |= min_t(u32, cap_info & IEEE80211_VHT_CAP_MAX_MPDU_MASK, own_cap.cap & IEEE80211_VHT_CAP_MAX_MPDU_MASK); /* and some based on our own capabilities */ switch (own_cap.cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) { case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ: vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ; break; case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ: vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; break; default: /* nothing */ break; } /* symmetric capabilities */ vht_cap->cap |= cap_info & own_cap.cap & (IEEE80211_VHT_CAP_SHORT_GI_80 | IEEE80211_VHT_CAP_SHORT_GI_160); /* remaining ones */ if (own_cap.cap & IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE) vht_cap->cap |= cap_info & (IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE | IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_MASK); if (own_cap.cap & IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE) vht_cap->cap |= cap_info & (IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE | IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK); if (own_cap.cap & IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE) vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE; if (own_cap.cap & IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE) vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE; if (own_cap.cap & IEEE80211_VHT_CAP_TXSTBC) vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_RXSTBC_MASK; if (own_cap.cap & IEEE80211_VHT_CAP_RXSTBC_MASK) vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_TXSTBC; /* Copy peer MCS info, the driver might need them. 
*/ memcpy(&vht_cap->vht_mcs, &vht_cap_ie->supp_mcs, sizeof(struct ieee80211_vht_mcs_info)); /* copy EXT_NSS_BW Support value or remove the capability */ if (ieee80211_hw_check(&sdata->local->hw, SUPPORTS_VHT_EXT_NSS_BW)) vht_cap->cap |= (cap_info & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK); else vht_cap->vht_mcs.tx_highest &= ~cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE); /* but also restrict MCSes */ for (i = 0; i < 8; i++) { u16 own_rx, own_tx, peer_rx, peer_tx; own_rx = le16_to_cpu(own_cap.vht_mcs.rx_mcs_map); own_rx = (own_rx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; own_tx = le16_to_cpu(own_cap.vht_mcs.tx_mcs_map); own_tx = (own_tx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; peer_rx = le16_to_cpu(vht_cap->vht_mcs.rx_mcs_map); peer_rx = (peer_rx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; peer_tx = le16_to_cpu(vht_cap->vht_mcs.tx_mcs_map); peer_tx = (peer_tx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; if (peer_tx != IEEE80211_VHT_MCS_NOT_SUPPORTED) { if (own_rx == IEEE80211_VHT_MCS_NOT_SUPPORTED) peer_tx = IEEE80211_VHT_MCS_NOT_SUPPORTED; else if (own_rx < peer_tx) peer_tx = own_rx; } if (peer_rx != IEEE80211_VHT_MCS_NOT_SUPPORTED) { if (own_tx == IEEE80211_VHT_MCS_NOT_SUPPORTED) peer_rx = IEEE80211_VHT_MCS_NOT_SUPPORTED; else if (own_tx < peer_rx) peer_rx = own_tx; } vht_cap->vht_mcs.rx_mcs_map &= ~cpu_to_le16(IEEE80211_VHT_MCS_NOT_SUPPORTED << i * 2); vht_cap->vht_mcs.rx_mcs_map |= cpu_to_le16(peer_rx << i * 2); vht_cap->vht_mcs.tx_mcs_map &= ~cpu_to_le16(IEEE80211_VHT_MCS_NOT_SUPPORTED << i * 2); vht_cap->vht_mcs.tx_mcs_map |= cpu_to_le16(peer_tx << i * 2); } /* * This is a workaround for VHT-enabled STAs which break the spec * and have the VHT-MCS Rx map filled in with value 3 for all eight * spatial streams, an example is AR9462. * * As per spec, in section 22.1.1 Introduction to the VHT PHY * A VHT STA shall support at least single spatial stream VHT-MCSs * 0 to 7 (transmit and receive) in all supported channel widths. */ if (vht_cap->vht_mcs.rx_mcs_map == cpu_to_le16(0xFFFF)) { vht_cap->vht_supported = false; sdata_info(sdata, "Ignoring VHT IE from %pM (link:%pM) due to invalid rx_mcs_map\n", link_sta->sta->addr, link_sta->addr); return; } /* finally set up the bandwidth */ switch (vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) { case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ: case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ: link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; break; default: link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80; if (!(vht_cap->vht_mcs.tx_highest & cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE))) break; /* * If this is non-zero, then it does support 160 MHz after all, * in one form or the other. We don't distinguish here (or even * above) between 160 and 80+80 yet. */ if (cap_info & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; } link_sta->pub->bandwidth = ieee80211_sta_cur_vht_bw(link_sta); /* * Work around the Cisco 9115 FW 17.3 bug by taking the min of * both reported MPDU lengths. */ mpdu_len = vht_cap->cap & IEEE80211_VHT_CAP_MAX_MPDU_MASK; if (vht_cap_ie2) mpdu_len = min_t(u32, mpdu_len, le32_get_bits(vht_cap_ie2->vht_cap_info, IEEE80211_VHT_CAP_MAX_MPDU_MASK)); /* * FIXME - should the amsdu len be per link? store per link * and maintain a minimum? 
*/ switch (mpdu_len) { case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454: link_sta->pub->agg.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_11454; break; case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991: link_sta->pub->agg.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_7991; break; case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895: default: link_sta->pub->agg.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_3895; break; } ieee80211_sta_recalc_aggregates(&link_sta->sta->sta); } /* FIXME: move this to some better location - parses HE/EHT now */ enum ieee80211_sta_rx_bandwidth _ieee80211_sta_cap_rx_bw(struct link_sta_info *link_sta, struct cfg80211_chan_def *chandef) { unsigned int link_id = link_sta->link_id; struct ieee80211_sub_if_data *sdata = link_sta->sta->sdata; struct ieee80211_sta_vht_cap *vht_cap = &link_sta->pub->vht_cap; struct ieee80211_sta_he_cap *he_cap = &link_sta->pub->he_cap; struct ieee80211_sta_eht_cap *eht_cap = &link_sta->pub->eht_cap; u32 cap_width; if (he_cap->has_he) { enum nl80211_band band; u8 info; if (chandef) { band = chandef->chan->band; } else { struct ieee80211_bss_conf *link_conf; rcu_read_lock(); link_conf = rcu_dereference(sdata->vif.link_conf[link_id]); band = link_conf->chanreq.oper.chan->band; rcu_read_unlock(); } if (eht_cap->has_eht && band == NL80211_BAND_6GHZ) { info = eht_cap->eht_cap_elem.phy_cap_info[0]; if (info & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ) return IEEE80211_STA_RX_BW_320; } info = he_cap->he_cap_elem.phy_cap_info[0]; if (band == NL80211_BAND_2GHZ) { if (info & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G) return IEEE80211_STA_RX_BW_40; return IEEE80211_STA_RX_BW_20; } if (info & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G || info & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G) return IEEE80211_STA_RX_BW_160; if (info & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G) return IEEE80211_STA_RX_BW_80; return IEEE80211_STA_RX_BW_20; } if (!vht_cap->vht_supported) return link_sta->pub->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ? IEEE80211_STA_RX_BW_40 : IEEE80211_STA_RX_BW_20; cap_width = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; if (cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ || cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) return IEEE80211_STA_RX_BW_160; /* * If this is non-zero, then it does support 160 MHz after all, * in one form or the other. We don't distinguish here (or even * above) between 160 and 80+80 yet. */ if (vht_cap->cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) return IEEE80211_STA_RX_BW_160; return IEEE80211_STA_RX_BW_80; } enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct link_sta_info *link_sta) { struct ieee80211_sta_vht_cap *vht_cap = &link_sta->pub->vht_cap; u32 cap_width; if (!vht_cap->vht_supported) { if (!link_sta->pub->ht_cap.ht_supported) return NL80211_CHAN_WIDTH_20_NOHT; return link_sta->pub->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ? 
NL80211_CHAN_WIDTH_40 : NL80211_CHAN_WIDTH_20; } cap_width = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; if (cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ) return NL80211_CHAN_WIDTH_160; else if (cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) return NL80211_CHAN_WIDTH_80P80; return NL80211_CHAN_WIDTH_80; } enum nl80211_chan_width ieee80211_sta_rx_bw_to_chan_width(struct link_sta_info *link_sta) { enum ieee80211_sta_rx_bandwidth cur_bw = link_sta->pub->bandwidth; struct ieee80211_sta_vht_cap *vht_cap = &link_sta->pub->vht_cap; u32 cap_width; switch (cur_bw) { case IEEE80211_STA_RX_BW_20: if (!link_sta->pub->ht_cap.ht_supported) return NL80211_CHAN_WIDTH_20_NOHT; else return NL80211_CHAN_WIDTH_20; case IEEE80211_STA_RX_BW_40: return NL80211_CHAN_WIDTH_40; case IEEE80211_STA_RX_BW_80: return NL80211_CHAN_WIDTH_80; case IEEE80211_STA_RX_BW_160: cap_width = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; if (cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ) return NL80211_CHAN_WIDTH_160; return NL80211_CHAN_WIDTH_80P80; default: return NL80211_CHAN_WIDTH_20; } } /* FIXME: rename/move - this deals with everything not just VHT */ enum ieee80211_sta_rx_bandwidth _ieee80211_sta_cur_vht_bw(struct link_sta_info *link_sta, struct cfg80211_chan_def *chandef) { struct sta_info *sta = link_sta->sta; enum nl80211_chan_width bss_width; enum ieee80211_sta_rx_bandwidth bw; if (chandef) { bss_width = chandef->width; } else { struct ieee80211_bss_conf *link_conf; rcu_read_lock(); link_conf = rcu_dereference(sta->sdata->vif.link_conf[link_sta->link_id]); if (WARN_ON_ONCE(!link_conf)) { rcu_read_unlock(); return IEEE80211_STA_RX_BW_20; } bss_width = link_conf->chanreq.oper.width; rcu_read_unlock(); } bw = _ieee80211_sta_cap_rx_bw(link_sta, chandef); bw = min(bw, link_sta->cur_max_bandwidth); /* Don't consider AP's bandwidth for TDLS peers, section 11.23.1 of * IEEE80211-2016 specification makes higher bandwidth operation * possible on the TDLS link if the peers have wider bandwidth * capability. * * However, in this case, and only if the TDLS peer is authorized, * limit to the tdls_chandef so that the configuration here isn't * wider than what's actually requested on the channel context. 
*/ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) && test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW) && test_sta_flag(sta, WLAN_STA_AUTHORIZED) && sta->tdls_chandef.chan) bw = min(bw, ieee80211_chan_width_to_rx_bw(sta->tdls_chandef.width)); else bw = min(bw, ieee80211_chan_width_to_rx_bw(bss_width)); return bw; } void ieee80211_sta_init_nss(struct link_sta_info *link_sta) { u8 ht_rx_nss = 0, vht_rx_nss = 0, he_rx_nss = 0, eht_rx_nss = 0, rx_nss; bool support_160; if (link_sta->pub->eht_cap.has_eht) { int i; const u8 *rx_nss_mcs = (void *)&link_sta->pub->eht_cap.eht_mcs_nss_supp; /* get the max nss for EHT over all possible bandwidths and mcs */ for (i = 0; i < sizeof(struct ieee80211_eht_mcs_nss_supp); i++) eht_rx_nss = max_t(u8, eht_rx_nss, u8_get_bits(rx_nss_mcs[i], IEEE80211_EHT_MCS_NSS_RX)); } if (link_sta->pub->he_cap.has_he) { int i; u8 rx_mcs_80 = 0, rx_mcs_160 = 0; const struct ieee80211_sta_he_cap *he_cap = &link_sta->pub->he_cap; u16 mcs_160_map = le16_to_cpu(he_cap->he_mcs_nss_supp.rx_mcs_160); u16 mcs_80_map = le16_to_cpu(he_cap->he_mcs_nss_supp.rx_mcs_80); for (i = 7; i >= 0; i--) { u8 mcs_160 = (mcs_160_map >> (2 * i)) & 3; if (mcs_160 != IEEE80211_HE_MCS_NOT_SUPPORTED) { rx_mcs_160 = i + 1; break; } } for (i = 7; i >= 0; i--) { u8 mcs_80 = (mcs_80_map >> (2 * i)) & 3; if (mcs_80 != IEEE80211_HE_MCS_NOT_SUPPORTED) { rx_mcs_80 = i + 1; break; } } support_160 = he_cap->he_cap_elem.phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G; if (support_160) he_rx_nss = min(rx_mcs_80, rx_mcs_160); else he_rx_nss = rx_mcs_80; } if (link_sta->pub->ht_cap.ht_supported) { if (link_sta->pub->ht_cap.mcs.rx_mask[0]) ht_rx_nss++; if (link_sta->pub->ht_cap.mcs.rx_mask[1]) ht_rx_nss++; if (link_sta->pub->ht_cap.mcs.rx_mask[2]) ht_rx_nss++; if (link_sta->pub->ht_cap.mcs.rx_mask[3]) ht_rx_nss++; /* FIXME: consider rx_highest? */ } if (link_sta->pub->vht_cap.vht_supported) { int i; u16 rx_mcs_map; rx_mcs_map = le16_to_cpu(link_sta->pub->vht_cap.vht_mcs.rx_mcs_map); for (i = 7; i >= 0; i--) { u8 mcs = (rx_mcs_map >> (2 * i)) & 3; if (mcs != IEEE80211_VHT_MCS_NOT_SUPPORTED) { vht_rx_nss = i + 1; break; } } /* FIXME: consider rx_highest? 
*/ } rx_nss = max(vht_rx_nss, ht_rx_nss); rx_nss = max(he_rx_nss, rx_nss); rx_nss = max(eht_rx_nss, rx_nss); rx_nss = max_t(u8, 1, rx_nss); link_sta->capa_nss = rx_nss; /* that shouldn't be set yet, but we can handle it anyway */ if (link_sta->op_mode_nss) link_sta->pub->rx_nss = min_t(u8, rx_nss, link_sta->op_mode_nss); else link_sta->pub->rx_nss = rx_nss; } u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct link_sta_info *link_sta, u8 opmode, enum nl80211_band band) { enum ieee80211_sta_rx_bandwidth new_bw; struct sta_opmode_info sta_opmode = {}; u32 changed = 0; u8 nss; /* ignore - no support for BF yet */ if (opmode & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF) return 0; nss = opmode & IEEE80211_OPMODE_NOTIF_RX_NSS_MASK; nss >>= IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT; nss += 1; if (link_sta->op_mode_nss != nss) { if (nss <= link_sta->capa_nss) { link_sta->op_mode_nss = nss; if (nss != link_sta->pub->rx_nss) { link_sta->pub->rx_nss = nss; changed |= IEEE80211_RC_NSS_CHANGED; sta_opmode.rx_nss = link_sta->pub->rx_nss; sta_opmode.changed |= STA_OPMODE_N_SS_CHANGED; } } else { pr_warn_ratelimited("Ignoring NSS change in VHT Operating Mode Notification from %pM with invalid nss %d", link_sta->pub->addr, nss); } } switch (opmode & IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK) { case IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ: /* ignore IEEE80211_OPMODE_NOTIF_BW_160_80P80 must not be set */ link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_20; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ: /* ignore IEEE80211_OPMODE_NOTIF_BW_160_80P80 must not be set */ link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_40; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ: if (opmode & IEEE80211_OPMODE_NOTIF_BW_160_80P80) link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; else link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ: /* legacy only, no longer used by newer spec */ link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; break; } new_bw = ieee80211_sta_cur_vht_bw(link_sta); if (new_bw != link_sta->pub->bandwidth) { link_sta->pub->bandwidth = new_bw; sta_opmode.bw = ieee80211_sta_rx_bw_to_chan_width(link_sta); changed |= IEEE80211_RC_BW_CHANGED; sta_opmode.changed |= STA_OPMODE_MAX_BW_CHANGED; } if (sta_opmode.changed) cfg80211_sta_opmode_change_notify(sdata->dev, link_sta->addr, &sta_opmode, GFP_KERNEL); return changed; } void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, struct ieee80211_mgmt *mgmt) { struct ieee80211_bss_conf *link_conf = link->conf; if (!link_conf->mu_mimo_owner) return; if (!memcmp(mgmt->u.action.u.vht_group_notif.position, link_conf->mu_group.position, WLAN_USER_POSITION_LEN) && !memcmp(mgmt->u.action.u.vht_group_notif.membership, link_conf->mu_group.membership, WLAN_MEMBERSHIP_LEN)) return; memcpy(link_conf->mu_group.membership, mgmt->u.action.u.vht_group_notif.membership, WLAN_MEMBERSHIP_LEN); memcpy(link_conf->mu_group.position, mgmt->u.action.u.vht_group_notif.position, WLAN_USER_POSITION_LEN); ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_MU_GROUPS); } void ieee80211_update_mu_groups(struct ieee80211_vif *vif, unsigned int link_id, const u8 *membership, const u8 *position) { struct ieee80211_bss_conf *link_conf; rcu_read_lock(); link_conf = rcu_dereference(vif->link_conf[link_id]); if (!WARN_ON_ONCE(!link_conf || !link_conf->mu_mimo_owner)) { memcpy(link_conf->mu_group.membership, membership, WLAN_MEMBERSHIP_LEN); 
memcpy(link_conf->mu_group.position, position, WLAN_USER_POSITION_LEN); } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_update_mu_groups); void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct link_sta_info *link_sta, u8 opmode, enum nl80211_band band) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band]; u32 changed = __ieee80211_vht_handle_opmode(sdata, link_sta, opmode, band); if (changed > 0) { ieee80211_recalc_min_chandef(sdata, link_sta->link_id); rate_control_rate_update(local, sband, link_sta, changed); } } void ieee80211_get_vht_mask_from_cap(__le16 vht_cap, u16 vht_mask[NL80211_VHT_NSS_MAX]) { int i; u16 mask, cap = le16_to_cpu(vht_cap); for (i = 0; i < NL80211_VHT_NSS_MAX; i++) { mask = (cap >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; switch (mask) { case IEEE80211_VHT_MCS_SUPPORT_0_7: vht_mask[i] = 0x00FF; break; case IEEE80211_VHT_MCS_SUPPORT_0_8: vht_mask[i] = 0x01FF; break; case IEEE80211_VHT_MCS_SUPPORT_0_9: vht_mask[i] = 0x03FF; break; case IEEE80211_VHT_MCS_NOT_SUPPORTED: default: vht_mask[i] = 0; break; } } }
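Throughout this file the VHT MCS maps are treated as eight 2-bit fields, one per spatial stream, which is why the loops above shift by 2*i and mask with IEEE80211_VHT_MCS_NOT_SUPPORTED (value 3). The small self-contained sketch below illustrates that encoding; decode_vht_mcs_map() is a hypothetical helper for illustration, not a mac80211 function.

/* Standalone sketch of the 2-bits-per-NSS VHT MCS map encoding:
 * 0 -> MCS 0-7, 1 -> MCS 0-8, 2 -> MCS 0-9, 3 -> not supported. */
#include <stdio.h>
#include <stdint.h>

static void decode_vht_mcs_map(uint16_t mcs_map)
{
	for (int nss = 0; nss < 8; nss++) {
		unsigned int val = (mcs_map >> (2 * nss)) & 0x3;

		if (val == 3)
			printf("NSS %d: not supported\n", nss + 1);
		else
			printf("NSS %d: MCS 0-%u\n", nss + 1, 7 + val);
	}
}

int main(void)
{
	/* 0xFFFA: two spatial streams supporting MCS 0-9, rest unsupported. */
	decode_vht_mcs_map(0xFFFA);
	return 0;
}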
/*
 * net/tipc/server.c: TIPC server infrastructure
 *
 * Copyright (c) 2012-2013, Wind River Systems
 * Copyright (c) 2017-2018, Ericsson AB
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "subscr.h" #include "topsrv.h" #include "core.h" #include "socket.h" #include "addr.h" #include "msg.h" #include "bearer.h" #include <net/sock.h> #include <linux/module.h> #include <trace/events/sock.h> /* Number of messages to send before rescheduling */ #define MAX_SEND_MSG_COUNT 25 #define MAX_RECV_MSG_COUNT 25 #define CF_CONNECTED 1 #define TIPC_SERVER_NAME_LEN 32 /** * struct tipc_topsrv - TIPC server structure * @conn_idr: identifier set of connection * @idr_lock: protect the connection identifier set * @idr_in_use: amount of allocated identifier entry * @net: network namspace instance * @awork: accept work item * @rcv_wq: receive workqueue * @send_wq: send workqueue * @listener: topsrv listener socket * @name: server name */ struct tipc_topsrv { struct idr conn_idr; spinlock_t idr_lock; /* for idr list */ int idr_in_use; struct net *net; struct work_struct awork; struct workqueue_struct *rcv_wq; struct workqueue_struct *send_wq; struct socket *listener; char name[TIPC_SERVER_NAME_LEN]; }; /** * struct tipc_conn - TIPC connection structure * @kref: reference counter to connection object * @conid: connection identifier * @sock: socket handler associated with connection * @flags: indicates connection state * @server: pointer to connected server * @sub_list: lsit to all pertaing subscriptions * @sub_lock: lock protecting the subscription list * @rwork: receive work item * @outqueue: pointer to first outbound message in queue * @outqueue_lock: control access to the outqueue * @swork: send work item */ struct tipc_conn { struct kref kref; int conid; struct socket *sock; unsigned long flags; struct tipc_topsrv *server; struct list_head sub_list; spinlock_t sub_lock; /* for subscription list */ struct work_struct rwork; struct list_head outqueue; spinlock_t outqueue_lock; /* for outqueue */ struct work_struct swork; }; /* An entry waiting to be sent */ struct outqueue_entry { bool inactive; struct tipc_event evt; struct list_head list; }; static void tipc_conn_recv_work(struct work_struct *work); static void tipc_conn_send_work(struct work_struct *work); static void tipc_topsrv_kern_evt(struct net *net, struct tipc_event *evt); static void 
tipc_conn_delete_sub(struct tipc_conn *con, struct tipc_subscr *s); static bool connected(struct tipc_conn *con) { return con && test_bit(CF_CONNECTED, &con->flags); } static void tipc_conn_kref_release(struct kref *kref) { struct tipc_conn *con = container_of(kref, struct tipc_conn, kref); struct tipc_topsrv *s = con->server; struct outqueue_entry *e, *safe; spin_lock_bh(&s->idr_lock); idr_remove(&s->conn_idr, con->conid); s->idr_in_use--; spin_unlock_bh(&s->idr_lock); if (con->sock) sock_release(con->sock); spin_lock_bh(&con->outqueue_lock); list_for_each_entry_safe(e, safe, &con->outqueue, list) { list_del(&e->list); kfree(e); } spin_unlock_bh(&con->outqueue_lock); kfree(con); } static void conn_put(struct tipc_conn *con) { kref_put(&con->kref, tipc_conn_kref_release); } static void conn_get(struct tipc_conn *con) { kref_get(&con->kref); } static void tipc_conn_close(struct tipc_conn *con) { struct sock *sk = con->sock->sk; bool disconnect = false; write_lock_bh(&sk->sk_callback_lock); disconnect = test_and_clear_bit(CF_CONNECTED, &con->flags); if (disconnect) { sk->sk_user_data = NULL; tipc_conn_delete_sub(con, NULL); } write_unlock_bh(&sk->sk_callback_lock); /* Handle concurrent calls from sending and receiving threads */ if (!disconnect) return; /* Don't flush pending works, -just let them expire */ kernel_sock_shutdown(con->sock, SHUT_RDWR); conn_put(con); } static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s, struct socket *sock) { struct tipc_conn *con; int ret; con = kzalloc(sizeof(*con), GFP_ATOMIC); if (!con) return ERR_PTR(-ENOMEM); kref_init(&con->kref); INIT_LIST_HEAD(&con->outqueue); INIT_LIST_HEAD(&con->sub_list); spin_lock_init(&con->outqueue_lock); spin_lock_init(&con->sub_lock); INIT_WORK(&con->swork, tipc_conn_send_work); INIT_WORK(&con->rwork, tipc_conn_recv_work); spin_lock_bh(&s->idr_lock); ret = idr_alloc(&s->conn_idr, con, 0, 0, GFP_ATOMIC); if (ret < 0) { kfree(con); spin_unlock_bh(&s->idr_lock); return ERR_PTR(-ENOMEM); } con->conid = ret; s->idr_in_use++; set_bit(CF_CONNECTED, &con->flags); con->server = s; con->sock = sock; conn_get(con); spin_unlock_bh(&s->idr_lock); return con; } static struct tipc_conn *tipc_conn_lookup(struct tipc_topsrv *s, int conid) { struct tipc_conn *con; spin_lock_bh(&s->idr_lock); con = idr_find(&s->conn_idr, conid); if (!connected(con) || !kref_get_unless_zero(&con->kref)) con = NULL; spin_unlock_bh(&s->idr_lock); return con; } /* tipc_conn_delete_sub - delete a specific or all subscriptions * for a given subscriber */ static void tipc_conn_delete_sub(struct tipc_conn *con, struct tipc_subscr *s) { struct tipc_net *tn = tipc_net(con->server->net); struct list_head *sub_list = &con->sub_list; struct tipc_subscription *sub, *tmp; spin_lock_bh(&con->sub_lock); list_for_each_entry_safe(sub, tmp, sub_list, sub_list) { if (!s || !memcmp(s, &sub->evt.s, sizeof(*s))) { tipc_sub_unsubscribe(sub); atomic_dec(&tn->subscription_count); if (s) break; } } spin_unlock_bh(&con->sub_lock); } static void tipc_conn_send_to_sock(struct tipc_conn *con) { struct list_head *queue = &con->outqueue; struct tipc_topsrv *srv = con->server; struct outqueue_entry *e; struct tipc_event *evt; struct msghdr msg; struct kvec iov; int count = 0; int ret; spin_lock_bh(&con->outqueue_lock); while (!list_empty(queue)) { e = list_first_entry(queue, struct outqueue_entry, list); evt = &e->evt; spin_unlock_bh(&con->outqueue_lock); if (e->inactive) tipc_conn_delete_sub(con, &evt->s); memset(&msg, 0, sizeof(msg)); msg.msg_flags = MSG_DONTWAIT; iov.iov_base = 
evt; iov.iov_len = sizeof(*evt); msg.msg_name = NULL; if (con->sock) { ret = kernel_sendmsg(con->sock, &msg, &iov, 1, sizeof(*evt)); if (ret == -EWOULDBLOCK || ret == 0) { cond_resched(); return; } else if (ret < 0) { return tipc_conn_close(con); } } else { tipc_topsrv_kern_evt(srv->net, evt); } /* Don't starve users filling buffers */ if (++count >= MAX_SEND_MSG_COUNT) { cond_resched(); count = 0; } spin_lock_bh(&con->outqueue_lock); list_del(&e->list); kfree(e); } spin_unlock_bh(&con->outqueue_lock); } static void tipc_conn_send_work(struct work_struct *work) { struct tipc_conn *con = container_of(work, struct tipc_conn, swork); if (connected(con)) tipc_conn_send_to_sock(con); conn_put(con); } /* tipc_topsrv_queue_evt() - interrupt level call from a subscription instance * The queued work is launched into tipc_conn_send_work()->tipc_conn_send_to_sock() */ void tipc_topsrv_queue_evt(struct net *net, int conid, u32 event, struct tipc_event *evt) { struct tipc_topsrv *srv = tipc_topsrv(net); struct outqueue_entry *e; struct tipc_conn *con; con = tipc_conn_lookup(srv, conid); if (!con) return; if (!connected(con)) goto err; e = kmalloc(sizeof(*e), GFP_ATOMIC); if (!e) goto err; e->inactive = (event == TIPC_SUBSCR_TIMEOUT); memcpy(&e->evt, evt, sizeof(*evt)); spin_lock_bh(&con->outqueue_lock); list_add_tail(&e->list, &con->outqueue); spin_unlock_bh(&con->outqueue_lock); if (queue_work(srv->send_wq, &con->swork)) return; err: conn_put(con); } /* tipc_conn_write_space - interrupt callback after a sendmsg EAGAIN * Indicates that there now is more space in the send buffer * The queued work is launched into tipc_send_work()->tipc_conn_send_to_sock() */ static void tipc_conn_write_space(struct sock *sk) { struct tipc_conn *con; read_lock_bh(&sk->sk_callback_lock); con = sk->sk_user_data; if (connected(con)) { conn_get(con); if (!queue_work(con->server->send_wq, &con->swork)) conn_put(con); } read_unlock_bh(&sk->sk_callback_lock); } static int tipc_conn_rcv_sub(struct tipc_topsrv *srv, struct tipc_conn *con, struct tipc_subscr *s) { struct tipc_net *tn = tipc_net(srv->net); struct tipc_subscription *sub; u32 s_filter = tipc_sub_read(s, filter); if (s_filter & TIPC_SUB_CANCEL) { tipc_sub_write(s, filter, s_filter & ~TIPC_SUB_CANCEL); tipc_conn_delete_sub(con, s); return 0; } if (atomic_read(&tn->subscription_count) >= TIPC_MAX_SUBSCR) { pr_warn("Subscription rejected, max (%u)\n", TIPC_MAX_SUBSCR); return -1; } sub = tipc_sub_subscribe(srv->net, s, con->conid); if (!sub) return -1; atomic_inc(&tn->subscription_count); spin_lock_bh(&con->sub_lock); list_add(&sub->sub_list, &con->sub_list); spin_unlock_bh(&con->sub_lock); return 0; } static int tipc_conn_rcv_from_sock(struct tipc_conn *con) { struct tipc_topsrv *srv = con->server; struct sock *sk = con->sock->sk; struct msghdr msg = {}; struct tipc_subscr s; struct kvec iov; int ret; iov.iov_base = &s; iov.iov_len = sizeof(s); msg.msg_name = NULL; iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, iov.iov_len); ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT); if (ret == -EWOULDBLOCK) return -EWOULDBLOCK; if (ret == sizeof(s)) { read_lock_bh(&sk->sk_callback_lock); /* RACE: the connection can be closed in the meantime */ if (likely(connected(con))) ret = tipc_conn_rcv_sub(srv, con, &s); read_unlock_bh(&sk->sk_callback_lock); if (!ret) return 0; } tipc_conn_close(con); return ret; } static void tipc_conn_recv_work(struct work_struct *work) { struct tipc_conn *con = container_of(work, struct tipc_conn, rwork); int count = 0; while (connected(con)) { if 
(tipc_conn_rcv_from_sock(con)) break; /* Don't flood Rx machine */ if (++count >= MAX_RECV_MSG_COUNT) { cond_resched(); count = 0; } } conn_put(con); } /* tipc_conn_data_ready - interrupt callback indicating the socket has data * The queued work is launched into tipc_recv_work()->tipc_conn_rcv_from_sock() */ static void tipc_conn_data_ready(struct sock *sk) { struct tipc_conn *con; trace_sk_data_ready(sk); read_lock_bh(&sk->sk_callback_lock); con = sk->sk_user_data; if (connected(con)) { conn_get(con); if (!queue_work(con->server->rcv_wq, &con->rwork)) conn_put(con); } read_unlock_bh(&sk->sk_callback_lock); } static void tipc_topsrv_accept(struct work_struct *work) { struct tipc_topsrv *srv = container_of(work, struct tipc_topsrv, awork); struct socket *newsock, *lsock; struct tipc_conn *con; struct sock *newsk; int ret; spin_lock_bh(&srv->idr_lock); if (!srv->listener) { spin_unlock_bh(&srv->idr_lock); return; } lsock = srv->listener; spin_unlock_bh(&srv->idr_lock); while (1) { ret = kernel_accept(lsock, &newsock, O_NONBLOCK); if (ret < 0) return; con = tipc_conn_alloc(srv, newsock); if (IS_ERR(con)) { ret = PTR_ERR(con); sock_release(newsock); return; } /* Register callbacks */ newsk = newsock->sk; write_lock_bh(&newsk->sk_callback_lock); newsk->sk_data_ready = tipc_conn_data_ready; newsk->sk_write_space = tipc_conn_write_space; newsk->sk_user_data = con; write_unlock_bh(&newsk->sk_callback_lock); /* Wake up receive process in case of 'SYN+' message */ newsk->sk_data_ready(newsk); conn_put(con); } } /* tipc_topsrv_listener_data_ready - interrupt callback with connection request * The queued job is launched into tipc_topsrv_accept() */ static void tipc_topsrv_listener_data_ready(struct sock *sk) { struct tipc_topsrv *srv; trace_sk_data_ready(sk); read_lock_bh(&sk->sk_callback_lock); srv = sk->sk_user_data; if (srv) queue_work(srv->rcv_wq, &srv->awork); read_unlock_bh(&sk->sk_callback_lock); } static int tipc_topsrv_create_listener(struct tipc_topsrv *srv) { struct socket *lsock = NULL; struct sockaddr_tipc saddr; struct sock *sk; int rc; rc = sock_create_kern(srv->net, AF_TIPC, SOCK_SEQPACKET, 0, &lsock); if (rc < 0) return rc; srv->listener = lsock; sk = lsock->sk; write_lock_bh(&sk->sk_callback_lock); sk->sk_data_ready = tipc_topsrv_listener_data_ready; sk->sk_user_data = srv; write_unlock_bh(&sk->sk_callback_lock); lock_sock(sk); rc = tsk_set_importance(sk, TIPC_CRITICAL_IMPORTANCE); release_sock(sk); if (rc < 0) goto err; saddr.family = AF_TIPC; saddr.addrtype = TIPC_SERVICE_RANGE; saddr.addr.nameseq.type = TIPC_TOP_SRV; saddr.addr.nameseq.lower = TIPC_TOP_SRV; saddr.addr.nameseq.upper = TIPC_TOP_SRV; saddr.scope = TIPC_NODE_SCOPE; rc = tipc_sk_bind(lsock, (struct sockaddr *)&saddr, sizeof(saddr)); if (rc < 0) goto err; rc = kernel_listen(lsock, 0); if (rc < 0) goto err; /* As server's listening socket owner and creator is the same module, * we have to decrease TIPC module reference count to guarantee that * it remains zero after the server socket is created, otherwise, * executing "rmmod" command is unable to make TIPC module deleted * after TIPC module is inserted successfully. * * However, the reference count is ever increased twice in * sock_create_kern(): one is to increase the reference count of owner * of TIPC socket's proto_ops struct; another is to increment the * reference count of owner of TIPC proto struct. Therefore, we must * decrement the module reference count twice to ensure that it keeps * zero after server's listening socket is created. 
Of course, we * must bump the module reference count twice as well before the socket * is closed. */ module_put(lsock->ops->owner); module_put(sk->sk_prot_creator->owner); return 0; err: sock_release(lsock); return -EINVAL; } bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, u32 upper, u32 filter, int *conid) { struct tipc_subscr sub; struct tipc_conn *con; int rc; sub.seq.type = type; sub.seq.lower = lower; sub.seq.upper = upper; sub.timeout = TIPC_WAIT_FOREVER; sub.filter = filter; *(u64 *)&sub.usr_handle = (u64)port; con = tipc_conn_alloc(tipc_topsrv(net), NULL); if (IS_ERR(con)) return false; *conid = con->conid; rc = tipc_conn_rcv_sub(tipc_topsrv(net), con, &sub); if (rc) conn_put(con); conn_put(con); return !rc; } void tipc_topsrv_kern_unsubscr(struct net *net, int conid) { struct tipc_conn *con; con = tipc_conn_lookup(tipc_topsrv(net), conid); if (!con) return; test_and_clear_bit(CF_CONNECTED, &con->flags); tipc_conn_delete_sub(con, NULL); conn_put(con); conn_put(con); } static void tipc_topsrv_kern_evt(struct net *net, struct tipc_event *evt) { u32 port = *(u32 *)&evt->s.usr_handle; u32 self = tipc_own_addr(net); struct sk_buff_head evtq; struct sk_buff *skb; skb = tipc_msg_create(TOP_SRV, 0, INT_H_SIZE, sizeof(*evt), self, self, port, port, 0); if (!skb) return; msg_set_dest_droppable(buf_msg(skb), true); memcpy(msg_data(buf_msg(skb)), evt, sizeof(*evt)); skb_queue_head_init(&evtq); __skb_queue_tail(&evtq, skb); tipc_loopback_trace(net, &evtq); tipc_sk_rcv(net, &evtq); } static int tipc_topsrv_work_start(struct tipc_topsrv *s) { s->rcv_wq = alloc_ordered_workqueue("tipc_rcv", 0); if (!s->rcv_wq) { pr_err("can't start tipc receive workqueue\n"); return -ENOMEM; } s->send_wq = alloc_ordered_workqueue("tipc_send", 0); if (!s->send_wq) { pr_err("can't start tipc send workqueue\n"); destroy_workqueue(s->rcv_wq); return -ENOMEM; } return 0; } static void tipc_topsrv_work_stop(struct tipc_topsrv *s) { destroy_workqueue(s->rcv_wq); destroy_workqueue(s->send_wq); } static int tipc_topsrv_start(struct net *net) { struct tipc_net *tn = tipc_net(net); const char name[] = "topology_server"; struct tipc_topsrv *srv; int ret; srv = kzalloc(sizeof(*srv), GFP_ATOMIC); if (!srv) return -ENOMEM; srv->net = net; INIT_WORK(&srv->awork, tipc_topsrv_accept); strscpy(srv->name, name, sizeof(srv->name)); tn->topsrv = srv; atomic_set(&tn->subscription_count, 0); spin_lock_init(&srv->idr_lock); idr_init(&srv->conn_idr); srv->idr_in_use = 0; ret = tipc_topsrv_work_start(srv); if (ret < 0) goto err_start; ret = tipc_topsrv_create_listener(srv); if (ret < 0) goto err_create; return 0; err_create: tipc_topsrv_work_stop(srv); err_start: kfree(srv); return ret; } static void tipc_topsrv_stop(struct net *net) { struct tipc_topsrv *srv = tipc_topsrv(net); struct socket *lsock = srv->listener; struct tipc_conn *con; int id; spin_lock_bh(&srv->idr_lock); for (id = 0; srv->idr_in_use; id++) { con = idr_find(&srv->conn_idr, id); if (con) { spin_unlock_bh(&srv->idr_lock); tipc_conn_close(con); spin_lock_bh(&srv->idr_lock); } } __module_get(lsock->ops->owner); __module_get(lsock->sk->sk_prot_creator->owner); srv->listener = NULL; spin_unlock_bh(&srv->idr_lock); tipc_topsrv_work_stop(srv); sock_release(lsock); idr_destroy(&srv->conn_idr); kfree(srv); } int __net_init tipc_topsrv_init_net(struct net *net) { return tipc_topsrv_start(net); } void __net_exit tipc_topsrv_exit_net(struct net *net) { tipc_topsrv_stop(net); }
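The listener created above is bound to the {TIPC_TOP_SRV, TIPC_TOP_SRV} service, and tipc_conn_rcv_from_sock()/tipc_conn_send_to_sock() exchange struct tipc_subscr requests and struct tipc_event replies on each accepted connection. Below is a minimal userspace sketch of that exchange; it assumes <linux/tipc.h> provides these structures and constants, and the service type 4711 is just an arbitrary example value.

/* Userspace sketch: subscribe to the TIPC topology server and read one
 * event. Service type 4711 is an arbitrary example. */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/tipc.h>

int main(void)
{
	struct sockaddr_tipc topsrv = {
		.family = AF_TIPC,
		.addrtype = TIPC_SERVICE_ADDR,
		.addr.name.name = { .type = TIPC_TOP_SRV,
				    .instance = TIPC_TOP_SRV },
	};
	struct tipc_subscr subscr = {
		.seq = { .type = 4711, .lower = 0, .upper = ~0U },
		.timeout = TIPC_WAIT_FOREVER,
		.filter = TIPC_SUB_SERVICE,
	};
	struct tipc_event event;
	int sd = socket(AF_TIPC, SOCK_SEQPACKET, 0);

	if (sd < 0 || connect(sd, (struct sockaddr *)&topsrv, sizeof(topsrv)))
		return 1;
	/* tipc_conn_rcv_from_sock() expects exactly sizeof(subscr) bytes. */
	if (send(sd, &subscr, sizeof(subscr), 0) != sizeof(subscr))
		return 1;
	if (recv(sd, &event, sizeof(event), 0) == sizeof(event))
		printf("event %u: instances %u-%u\n", event.event,
		       event.found_lower, event.found_upper);
	close(sd);
	return 0;
}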
// SPDX-License-Identifier: GPL-2.0
#include <linux/utsname.h>
#include <net/cfg80211.h>
#include "core.h"
#include "rdev-ops.h"

void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
{
	struct wireless_dev *wdev = dev->ieee80211_ptr;
	struct device *pdev = wiphy_dev(wdev->wiphy);

	if (pdev->driver)
		strscpy(info->driver, pdev->driver->name,
			sizeof(info->driver));
	else
		strscpy(info->driver, "N/A", sizeof(info->driver));

	strscpy(info->version, init_utsname()->release, sizeof(info->version));

	if (wdev->wiphy->fw_version[0])
		strscpy(info->fw_version, wdev->wiphy->fw_version,
			sizeof(info->fw_version));
	else
		strscpy(info->fw_version, "N/A", sizeof(info->fw_version));

	strscpy(info->bus_info, dev_name(wiphy_dev(wdev->wiphy)),
		sizeof(info->bus_info));
}
EXPORT_SYMBOL(cfg80211_get_drvinfo);
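cfg80211_get_drvinfo() above is an ethtool_ops->get_drvinfo() callback, i.e. it fills the strings returned for an ETHTOOL_GDRVINFO request. A minimal userspace sketch of issuing that request through the legacy SIOCETHTOOL ioctl follows; the interface name "wlan0" is just an example.

/* Userspace sketch: fetch the driver info filled in by a get_drvinfo()
 * handler such as cfg80211_get_drvinfo(). */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

int main(void)
{
	struct ethtool_drvinfo drvinfo = { .cmd = ETHTOOL_GDRVINFO };
	struct ifreq ifr;
	int sd = socket(AF_INET, SOCK_DGRAM, 0);

	if (sd < 0)
		return 1;
	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "wlan0", IFNAMSIZ - 1);
	ifr.ifr_data = (void *)&drvinfo;
	if (ioctl(sd, SIOCETHTOOL, &ifr) == 0)
		printf("driver=%s version=%s fw=%s bus=%s\n",
		       drvinfo.driver, drvinfo.version,
		       drvinfo.fw_version, drvinfo.bus_info);
	close(sd);
	return 0;
}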
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* memcontrol.h - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 */

#ifndef _LINUX_MEMCONTROL_H
#define _LINUX_MEMCONTROL_H
#include <linux/cgroup.h>
#include <linux/vm_event_item.h>
#include <linux/hardirq.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/page_counter.h>
#include <linux/vmpressure.h>
#include <linux/eventfd.h>
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/writeback.h>
#include <linux/page-flags.h>
#include <linux/shrinker.h>

struct mem_cgroup;
struct obj_cgroup;
struct page;
struct mm_struct;
struct kmem_cache;

/* Cgroup-specific page state, on top of universal node page state */
enum memcg_stat_item {
	MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS,
	MEMCG_SOCK,
	MEMCG_PERCPU_B,
	MEMCG_VMALLOC,
	MEMCG_KMEM,
	MEMCG_ZSWAP_B,
	MEMCG_ZSWAPPED,
	MEMCG_NR_STAT,
};

enum memcg_memory_event {
	MEMCG_LOW,
	MEMCG_HIGH,
	MEMCG_MAX,
	MEMCG_OOM,
	MEMCG_OOM_KILL,
	MEMCG_OOM_GROUP_KILL,
	MEMCG_SWAP_HIGH,
	MEMCG_SWAP_MAX,
	MEMCG_SWAP_FAIL,
	MEMCG_NR_MEMORY_EVENTS,
};

struct mem_cgroup_reclaim_cookie {
	pg_data_t *pgdat;
	int generation;
};

#ifdef CONFIG_MEMCG

#define MEM_CGROUP_ID_SHIFT	16

struct mem_cgroup_id {
	int id;
	refcount_t ref;
};

struct memcg_vmstats_percpu;
struct memcg1_events_percpu;
struct memcg_vmstats;
struct lruvec_stats_percpu;
struct lruvec_stats;

struct mem_cgroup_reclaim_iter {
	struct mem_cgroup *position;
	/* scan generation, increased every round-trip */
	atomic_t generation;
};

/*
 * per-node information in memory controller.
 */
struct mem_cgroup_per_node {
	/* Keep the read-only fields at the start */
	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
						/* use container_of	   */

	struct lruvec_stats_percpu __percpu	*lruvec_stats_percpu;
	struct lruvec_stats			*lruvec_stats;
	struct shrinker_info __rcu	*shrinker_info;

#ifdef CONFIG_MEMCG_V1
	/*
	 * Memcg-v1 only stuff in middle as buffer between read mostly fields
	 * and update often fields to avoid false sharing. If v1 stuff is
	 * not present, an explicit padding is needed.
*/ struct rb_node tree_node; /* RB tree node */ unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ bool on_tree; #else CACHELINE_PADDING(_pad1_); #endif /* Fields which get updated often at the end. */ struct lruvec lruvec; CACHELINE_PADDING(_pad2_); unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter; }; struct mem_cgroup_threshold { struct eventfd_ctx *eventfd; unsigned long threshold; }; /* For threshold */ struct mem_cgroup_threshold_ary { /* An array index points to threshold just below or equal to usage. */ int current_threshold; /* Size of entries[] */ unsigned int size; /* Array of thresholds */ struct mem_cgroup_threshold entries[] __counted_by(size); }; struct mem_cgroup_thresholds { /* Primary thresholds array */ struct mem_cgroup_threshold_ary *primary; /* * Spare threshold array. * This is needed to make mem_cgroup_unregister_event() "never fail". * It must be able to store at least primary->size - 1 entries. */ struct mem_cgroup_threshold_ary *spare; }; /* * Remember four most recent foreign writebacks with dirty pages in this * cgroup. Inode sharing is expected to be uncommon and, even if we miss * one in a given round, we're likely to catch it later if it keeps * foreign-dirtying, so a fairly low count should be enough. * * See mem_cgroup_track_foreign_dirty_slowpath() for details. */ #define MEMCG_CGWB_FRN_CNT 4 struct memcg_cgwb_frn { u64 bdi_id; /* bdi->id of the foreign inode */ int memcg_id; /* memcg->css.id of foreign inode */ u64 at; /* jiffies_64 at the time of dirtying */ struct wb_completion done; /* tracks in-flight foreign writebacks */ }; /* * Bucket for arbitrarily byte-sized objects charged to a memory * cgroup. The bucket can be reparented in one piece when the cgroup * is destroyed, without having to round up the individual references * of all live memory objects in the wild. */ struct obj_cgroup { struct percpu_ref refcnt; struct mem_cgroup *memcg; atomic_t nr_charged_bytes; union { struct list_head list; /* protected by objcg_lock */ struct rcu_head rcu; }; }; /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide * statistics based on the statistics developed by Rik Van Riel for clock-pro, * to help the administrator determine what knobs to tune. */ struct mem_cgroup { struct cgroup_subsys_state css; /* Private memcg ID. Used to ID objects that outlive the cgroup */ struct mem_cgroup_id id; /* Accounted resources */ struct page_counter memory; /* Both v1 & v2 */ union { struct page_counter swap; /* v2 only */ struct page_counter memsw; /* v1 only */ }; /* registered local peak watchers */ struct list_head memory_peaks; struct list_head swap_peaks; spinlock_t peaks_lock; /* Range enforcement for interrupt charges */ struct work_struct high_work; #ifdef CONFIG_ZSWAP unsigned long zswap_max; /* * Prevent pages from this memcg from being written back from zswap to * swap, and from being swapped out on zswap store failures. */ bool zswap_writeback; #endif /* vmpressure notifications */ struct vmpressure vmpressure; /* * Should the OOM killer kill all belonging tasks, had it kill one? 
         */
        bool oom_group;

        int swappiness;

        /* memory.events and memory.events.local */
        struct cgroup_file events_file;
        struct cgroup_file events_local_file;

        /* handle for "memory.swap.events" */
        struct cgroup_file swap_events_file;

        /* memory.stat */
        struct memcg_vmstats *vmstats;

        /* memory.events */
        atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
        atomic_long_t memory_events_local[MEMCG_NR_MEMORY_EVENTS];

        /*
         * Hint of reclaim pressure for socket memory management. Note
         * that this indicator should NOT be used in legacy cgroup mode
         * where socket memory is accounted/charged separately.
         */
        unsigned long socket_pressure;

        int kmemcg_id;
        /*
         * memcg->objcg is wiped out as a part of the objcg reparenting
         * process. memcg->orig_objcg preserves a pointer (and a reference)
         * to the original objcg until the end of the memcg's life.
         */
        struct obj_cgroup __rcu *objcg;
        struct obj_cgroup *orig_objcg;

        /* list of inherited objcgs, protected by objcg_lock */
        struct list_head objcg_list;

        struct memcg_vmstats_percpu __percpu *vmstats_percpu;

#ifdef CONFIG_CGROUP_WRITEBACK
        struct list_head cgwb_list;
        struct wb_domain cgwb_domain;
        struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        struct deferred_split deferred_split_queue;
#endif

#ifdef CONFIG_LRU_GEN_WALKS_MMU
        /* per-memcg mm_struct list */
        struct lru_gen_mm_list mm_list;
#endif

#ifdef CONFIG_MEMCG_V1
        /* Legacy consumer-oriented counters */
        struct page_counter kmem;               /* v1 only */
        struct page_counter tcpmem;             /* v1 only */

        struct memcg1_events_percpu __percpu *events_percpu;

        unsigned long soft_limit;

        /* protected by memcg_oom_lock */
        bool oom_lock;
        int under_oom;

        /* OOM-Killer disable */
        int oom_kill_disable;

        /* protect arrays of thresholds */
        struct mutex thresholds_lock;

        /* thresholds for memory usage. RCU-protected */
        struct mem_cgroup_thresholds thresholds;

        /* thresholds for mem+swap usage. RCU-protected */
        struct mem_cgroup_thresholds memsw_thresholds;

        /* For oom notifier event fd */
        struct list_head oom_notify;

        /* Legacy tcp memory accounting */
        bool tcpmem_active;
        int tcpmem_pressure;

        /* List of events which userspace wants to receive */
        struct list_head event_list;
        spinlock_t event_list_lock;
#endif /* CONFIG_MEMCG_V1 */

        struct mem_cgroup_per_node *nodeinfo[];
};

/*
 * size of first charge trial.
 * TODO: it may be necessary to use bigger numbers on big irons, or to make
 * this dynamic based on the workload.
 */
#define MEMCG_CHARGE_BATCH 64U

extern struct mem_cgroup *root_mem_cgroup;

enum page_memcg_data_flags {
        /* page->memcg_data is a pointer to a slabobj_ext vector */
        MEMCG_DATA_OBJEXTS = (1UL << 0),
        /* page has been accounted as a non-slab kernel page */
        MEMCG_DATA_KMEM = (1UL << 1),
        /* the next bit after the last actual flag */
        __NR_MEMCG_DATA_FLAGS = (1UL << 2),
};

#define __FIRST_OBJEXT_FLAG     __NR_MEMCG_DATA_FLAGS

#else /* CONFIG_MEMCG */

#define __FIRST_OBJEXT_FLAG     (1UL << 0)

#endif /* CONFIG_MEMCG */

enum objext_flags {
        /* slabobj_ext vector failed to allocate */
        OBJEXTS_ALLOC_FAIL = __FIRST_OBJEXT_FLAG,
        /* the next bit after the last actual flag */
        __NR_OBJEXTS_FLAGS = (__FIRST_OBJEXT_FLAG << 1),
};

#define OBJEXTS_FLAGS_MASK      (__NR_OBJEXTS_FLAGS - 1)

#ifdef CONFIG_MEMCG

static inline bool folio_memcg_kmem(struct folio *folio);

/*
 * After initialization, objcg->memcg is always pointing at
 * a valid memcg, but it can be atomically swapped to the parent memcg.
 *
 * The caller must ensure that the returned memcg won't be released.
*/ static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg) { lockdep_assert_once(rcu_read_lock_held() || lockdep_is_held(&cgroup_mutex)); return READ_ONCE(objcg->memcg); } /* * __folio_memcg - Get the memory cgroup associated with a non-kmem folio * @folio: Pointer to the folio. * * Returns a pointer to the memory cgroup associated with the folio, * or NULL. This function assumes that the folio is known to have a * proper memory cgroup pointer. It's not safe to call this function * against some type of folios, e.g. slab folios or ex-slab folios or * kmem folios. */ static inline struct mem_cgroup *__folio_memcg(struct folio *folio) { unsigned long memcg_data = folio->memcg_data; VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio); VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio); return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); } /* * __folio_objcg - get the object cgroup associated with a kmem folio. * @folio: Pointer to the folio. * * Returns a pointer to the object cgroup associated with the folio, * or NULL. This function assumes that the folio is known to have a * proper object cgroup pointer. It's not safe to call this function * against some type of folios, e.g. slab folios or ex-slab folios or * LRU folios. */ static inline struct obj_cgroup *__folio_objcg(struct folio *folio) { unsigned long memcg_data = folio->memcg_data; VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio); VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio); return (struct obj_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); } /* * folio_memcg - Get the memory cgroup associated with a folio. * @folio: Pointer to the folio. * * Returns a pointer to the memory cgroup associated with the folio, * or NULL. This function assumes that the folio is known to have a * proper memory cgroup pointer. It's not safe to call this function * against some type of folios, e.g. slab folios or ex-slab folios. * * For a non-kmem folio any of the following ensures folio and memcg binding * stability: * * - the folio lock * - LRU isolation * - exclusive reference * * For a kmem folio a caller should hold an rcu read lock to protect memcg * associated with a kmem folio from being released. */ static inline struct mem_cgroup *folio_memcg(struct folio *folio) { if (folio_memcg_kmem(folio)) return obj_cgroup_memcg(__folio_objcg(folio)); return __folio_memcg(folio); } /* * folio_memcg_charged - If a folio is charged to a memory cgroup. * @folio: Pointer to the folio. * * Returns true if folio is charged to a memory cgroup, otherwise returns false. */ static inline bool folio_memcg_charged(struct folio *folio) { if (folio_memcg_kmem(folio)) return __folio_objcg(folio) != NULL; return __folio_memcg(folio) != NULL; } /* * folio_memcg_check - Get the memory cgroup associated with a folio. * @folio: Pointer to the folio. * * Returns a pointer to the memory cgroup associated with the folio, * or NULL. This function unlike folio_memcg() can take any folio * as an argument. It has to be used in cases when it's not known if a folio * has an associated memory cgroup pointer or an object cgroups vector or * an object cgroup. * * For a non-kmem folio any of the following ensures folio and memcg binding * stability: * * - the folio lock * - LRU isolation * - exclusive reference * * For a kmem folio a caller should hold an rcu read lock to protect memcg * associated with a kmem folio from being released. 
*/ static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) { /* * Because folio->memcg_data might be changed asynchronously * for slabs, READ_ONCE() should be used here. */ unsigned long memcg_data = READ_ONCE(folio->memcg_data); if (memcg_data & MEMCG_DATA_OBJEXTS) return NULL; if (memcg_data & MEMCG_DATA_KMEM) { struct obj_cgroup *objcg; objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK); return obj_cgroup_memcg(objcg); } return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); } static inline struct mem_cgroup *page_memcg_check(struct page *page) { if (PageTail(page)) return NULL; return folio_memcg_check((struct folio *)page); } static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) { struct mem_cgroup *memcg; rcu_read_lock(); retry: memcg = obj_cgroup_memcg(objcg); if (unlikely(!css_tryget(&memcg->css))) goto retry; rcu_read_unlock(); return memcg; } /* * folio_memcg_kmem - Check if the folio has the memcg_kmem flag set. * @folio: Pointer to the folio. * * Checks if the folio has MemcgKmem flag set. The caller must ensure * that the folio has an associated memory cgroup. It's not safe to call * this function against some types of folios, e.g. slab folios. */ static inline bool folio_memcg_kmem(struct folio *folio) { VM_BUG_ON_PGFLAGS(PageTail(&folio->page), &folio->page); VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJEXTS, folio); return folio->memcg_data & MEMCG_DATA_KMEM; } static inline bool PageMemcgKmem(struct page *page) { return folio_memcg_kmem(page_folio(page)); } static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return (memcg == root_mem_cgroup); } static inline bool mem_cgroup_disabled(void) { return !cgroup_subsys_enabled(memory_cgrp_subsys); } static inline void mem_cgroup_protection(struct mem_cgroup *root, struct mem_cgroup *memcg, unsigned long *min, unsigned long *low) { *min = *low = 0; if (mem_cgroup_disabled()) return; /* * There is no reclaim protection applied to a targeted reclaim. * We are special casing this specific case here because * mem_cgroup_calculate_protection is not robust enough to keep * the protection invariant for calculated effective values for * parallel reclaimers with different reclaim target. This is * especially a problem for tail memcgs (as they have pages on LRU) * which would want to have effective values 0 for targeted reclaim * but a different value for external reclaim. * * Example * Let's have global and A's reclaim in parallel: * | * A (low=2G, usage = 3G, max = 3G, children_low_usage = 1.5G) * |\ * | C (low = 1G, usage = 2.5G) * B (low = 1G, usage = 0.5G) * * For the global reclaim * A.elow = A.low * B.elow = min(B.usage, B.low) because children_low_usage <= A.elow * C.elow = min(C.usage, C.low) * * With the effective values resetting we have A reclaim * A.elow = 0 * B.elow = B.low * C.elow = C.low * * If the global reclaim races with A's reclaim then * B.elow = C.elow = 0 because children_low_usage > A.elow) * is possible and reclaiming B would be violating the protection. * */ if (root == memcg) return; *min = READ_ONCE(memcg->memory.emin); *low = READ_ONCE(memcg->memory.elow); } void mem_cgroup_calculate_protection(struct mem_cgroup *root, struct mem_cgroup *memcg); static inline bool mem_cgroup_unprotected(struct mem_cgroup *target, struct mem_cgroup *memcg) { /* * The root memcg doesn't account charges, and doesn't support * protection. 
The target memcg's protection is ignored, see * mem_cgroup_calculate_protection() and mem_cgroup_protection() */ return mem_cgroup_disabled() || mem_cgroup_is_root(memcg) || memcg == target; } static inline bool mem_cgroup_below_low(struct mem_cgroup *target, struct mem_cgroup *memcg) { if (mem_cgroup_unprotected(target, memcg)) return false; return READ_ONCE(memcg->memory.elow) >= page_counter_read(&memcg->memory); } static inline bool mem_cgroup_below_min(struct mem_cgroup *target, struct mem_cgroup *memcg) { if (mem_cgroup_unprotected(target, memcg)) return false; return READ_ONCE(memcg->memory.emin) >= page_counter_read(&memcg->memory); } void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg); int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp); /** * mem_cgroup_charge - Charge a newly allocated folio to a cgroup. * @folio: Folio to charge. * @mm: mm context of the allocating task. * @gfp: Reclaim mode. * * Try to charge @folio to the memcg that @mm belongs to, reclaiming * pages according to @gfp if necessary. If @mm is NULL, try to * charge to the active memcg. * * Do not use this for folios allocated for swapin. * * Return: 0 on success. Otherwise, an error code is returned. */ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) { if (mem_cgroup_disabled()) return 0; return __mem_cgroup_charge(folio, mm, gfp); } int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, long nr_pages); int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); void __mem_cgroup_uncharge(struct folio *folio); /** * mem_cgroup_uncharge - Uncharge a folio. * @folio: Folio to uncharge. * * Uncharge a folio previously charged with mem_cgroup_charge(). */ static inline void mem_cgroup_uncharge(struct folio *folio) { if (mem_cgroup_disabled()) return; __mem_cgroup_uncharge(folio); } void __mem_cgroup_uncharge_folios(struct folio_batch *folios); static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) { if (mem_cgroup_disabled()) return; __mem_cgroup_uncharge_folios(folios); } void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages); void mem_cgroup_replace_folio(struct folio *old, struct folio *new); void mem_cgroup_migrate(struct folio *old, struct folio *new); /** * mem_cgroup_lruvec - get the lru list vector for a memcg & node * @memcg: memcg of the wanted lruvec * @pgdat: pglist_data * * Returns the lru list vector holding pages for a given @memcg & * @pgdat combination. This can be the node lruvec, if the memory * controller is disabled. */ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, struct pglist_data *pgdat) { struct mem_cgroup_per_node *mz; struct lruvec *lruvec; if (mem_cgroup_disabled()) { lruvec = &pgdat->__lruvec; goto out; } if (!memcg) memcg = root_mem_cgroup; mz = memcg->nodeinfo[pgdat->node_id]; lruvec = &mz->lruvec; out: /* * Since a node can be onlined after the mem_cgroup was created, * we have to be prepared to initialize lruvec->pgdat here; * and if offlined then reonlined, we need to reinitialize it. */ if (unlikely(lruvec->pgdat != pgdat)) lruvec->pgdat = pgdat; return lruvec; } /** * folio_lruvec - return lruvec for isolating/putting an LRU folio * @folio: Pointer to the folio. * * This function relies on folio->mem_cgroup being stable. 
*/ static inline struct lruvec *folio_lruvec(struct folio *folio) { struct mem_cgroup *memcg = folio_memcg(folio); VM_WARN_ON_ONCE_FOLIO(!memcg && !mem_cgroup_disabled(), folio); return mem_cgroup_lruvec(memcg, folio_pgdat(folio)); } struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); struct mem_cgroup *get_mem_cgroup_from_current(void); struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio); struct lruvec *folio_lruvec_lock(struct folio *folio); struct lruvec *folio_lruvec_lock_irq(struct folio *folio); struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags); #ifdef CONFIG_DEBUG_VM void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio); #else static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) { } #endif static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? container_of(css, struct mem_cgroup, css) : NULL; } static inline bool obj_cgroup_tryget(struct obj_cgroup *objcg) { return percpu_ref_tryget(&objcg->refcnt); } static inline void obj_cgroup_get(struct obj_cgroup *objcg) { percpu_ref_get(&objcg->refcnt); } static inline void obj_cgroup_get_many(struct obj_cgroup *objcg, unsigned long nr) { percpu_ref_get_many(&objcg->refcnt, nr); } static inline void obj_cgroup_put(struct obj_cgroup *objcg) { if (objcg) percpu_ref_put(&objcg->refcnt); } static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) { return !memcg || css_tryget(&memcg->css); } static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg) { return !memcg || css_tryget_online(&memcg->css); } static inline void mem_cgroup_put(struct mem_cgroup *memcg) { if (memcg) css_put(&memcg->css); } #define mem_cgroup_from_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, struct mem_cgroup *, struct mem_cgroup_reclaim_cookie *); void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, int (*)(struct task_struct *, void *), void *arg); static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) return 0; return memcg->id.id; } struct mem_cgroup *mem_cgroup_from_id(unsigned short id); #ifdef CONFIG_SHRINKER_DEBUG static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) { return memcg ? cgroup_ino(memcg->css.cgroup) : 0; } struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino); #endif static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { return mem_cgroup_from_css(seq_css(m)); } static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) { struct mem_cgroup_per_node *mz; if (mem_cgroup_disabled()) return NULL; mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); return mz->memcg; } /** * parent_mem_cgroup - find the accounting parent of a memcg * @memcg: memcg whose parent to find * * Returns the parent memcg, or NULL if this is the root. 
*/ static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { return mem_cgroup_from_css(memcg->css.parent); } static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root) { if (root == memcg) return true; return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup); } static inline bool mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *memcg) { struct mem_cgroup *task_memcg; bool match = false; rcu_read_lock(); task_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (task_memcg) match = mem_cgroup_is_descendant(task_memcg, memcg); rcu_read_unlock(); return match; } struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio); ino_t page_cgroup_ino(struct page *page); static inline bool mem_cgroup_online(struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) return true; return !!(memcg->css.flags & CSS_ONLINE); } void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int zid, int nr_pages); static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { struct mem_cgroup_per_node *mz; mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); return READ_ONCE(mz->lru_zone_size[zone_idx][lru]); } void mem_cgroup_handle_over_high(gfp_t gfp_mask); unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); unsigned long mem_cgroup_size(struct mem_cgroup *memcg); void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p); void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg); struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, struct mem_cgroup *oom_domain); void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int val); /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int val) { unsigned long flags; local_irq_save(flags); __mod_memcg_state(memcg, idx, val); local_irq_restore(flags); } static inline void mod_memcg_page_state(struct page *page, enum memcg_stat_item idx, int val) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; rcu_read_lock(); memcg = folio_memcg(page_folio(page)); if (memcg) mod_memcg_state(memcg, idx, val); rcu_read_unlock(); } unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx); unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx); unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx); void mem_cgroup_flush_stats(struct mem_cgroup *memcg); void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg); void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { unsigned long flags; local_irq_save(flags); __mod_lruvec_kmem_state(p, idx, val); local_irq_restore(flags); } void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count); static inline void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { unsigned long flags; local_irq_save(flags); __count_memcg_events(memcg, idx, count); local_irq_restore(flags); } static inline void count_memcg_folio_events(struct folio *folio, enum vm_event_item idx, unsigned long nr) { struct mem_cgroup *memcg = folio_memcg(folio); if (memcg) count_memcg_events(memcg, idx, nr); } static inline 
void count_memcg_events_mm(struct mm_struct *mm, enum vm_event_item idx, unsigned long count) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (likely(memcg)) count_memcg_events(memcg, idx, count); rcu_read_unlock(); } static inline void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { count_memcg_events_mm(mm, idx, 1); } static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX || event == MEMCG_SWAP_FAIL; atomic_long_inc(&memcg->memory_events_local[event]); if (!swap_event) cgroup_file_notify(&memcg->events_local_file); do { atomic_long_inc(&memcg->memory_events[event]); if (swap_event) cgroup_file_notify(&memcg->swap_events_file); else cgroup_file_notify(&memcg->events_file); if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) break; if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) break; } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); } static inline void memcg_memory_event_mm(struct mm_struct *mm, enum memcg_memory_event event) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (likely(memcg)) memcg_memory_event(memcg, event); rcu_read_unlock(); } void split_page_memcg(struct page *head, int old_order, int new_order); #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 static inline struct mem_cgroup *folio_memcg(struct folio *folio) { return NULL; } static inline bool folio_memcg_charged(struct folio *folio) { return false; } static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) { return NULL; } static inline struct mem_cgroup *page_memcg_check(struct page *page) { return NULL; } static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) { return NULL; } static inline bool folio_memcg_kmem(struct folio *folio) { return false; } static inline bool PageMemcgKmem(struct page *page) { return false; } static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return true; } static inline bool mem_cgroup_disabled(void) { return true; } static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { } static inline void memcg_memory_event_mm(struct mm_struct *mm, enum memcg_memory_event event) { } static inline void mem_cgroup_protection(struct mem_cgroup *root, struct mem_cgroup *memcg, unsigned long *min, unsigned long *low) { *min = *low = 0; } static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root, struct mem_cgroup *memcg) { } static inline bool mem_cgroup_unprotected(struct mem_cgroup *target, struct mem_cgroup *memcg) { return true; } static inline bool mem_cgroup_below_low(struct mem_cgroup *target, struct mem_cgroup *memcg) { return false; } static inline bool mem_cgroup_below_min(struct mem_cgroup *target, struct mem_cgroup *memcg) { return false; } static inline void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg) { } static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) { return 0; } static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, long nr_pages) { return 0; } static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { return 0; } static inline void 
mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr) { } static inline void mem_cgroup_uncharge(struct folio *folio) { } static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) { } static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { } static inline void mem_cgroup_replace_folio(struct folio *old, struct folio *new) { } static inline void mem_cgroup_migrate(struct folio *old, struct folio *new) { } static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, struct pglist_data *pgdat) { return &pgdat->__lruvec; } static inline struct lruvec *folio_lruvec(struct folio *folio) { struct pglist_data *pgdat = folio_pgdat(folio); return &pgdat->__lruvec; } static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) { } static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { return NULL; } static inline bool mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *memcg) { return true; } static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { return NULL; } static inline struct mem_cgroup *get_mem_cgroup_from_current(void) { return NULL; } static inline struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio) { return NULL; } static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css) { return NULL; } static inline void obj_cgroup_get(struct obj_cgroup *objcg) { } static inline void obj_cgroup_put(struct obj_cgroup *objcg) { } static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) { return true; } static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg) { return true; } static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } static inline struct lruvec *folio_lruvec_lock(struct folio *folio) { struct pglist_data *pgdat = folio_pgdat(folio); spin_lock(&pgdat->__lruvec.lru_lock); return &pgdat->__lruvec; } static inline struct lruvec *folio_lruvec_lock_irq(struct folio *folio) { struct pglist_data *pgdat = folio_pgdat(folio); spin_lock_irq(&pgdat->__lruvec.lru_lock); return &pgdat->__lruvec; } static inline struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flagsp) { struct pglist_data *pgdat = folio_pgdat(folio); spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp); return &pgdat->__lruvec; } static inline struct mem_cgroup * mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, struct mem_cgroup_reclaim_cookie *reclaim) { return NULL; } static inline void mem_cgroup_iter_break(struct mem_cgroup *root, struct mem_cgroup *prev) { } static inline void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, int (*fn)(struct task_struct *, void *), void *arg) { } static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { return 0; } static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) { WARN_ON_ONCE(id); /* XXX: This should always return root_mem_cgroup */ return NULL; } #ifdef CONFIG_SHRINKER_DEBUG static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) { return 0; } static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) { return NULL; } #endif static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { return NULL; } static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) { return NULL; } static inline bool mem_cgroup_online(struct mem_cgroup *memcg) { return true; } static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list 
lru, int zone_idx) { return 0; } static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { return 0; } static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg) { return 0; } static inline void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) { } static inline void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { } static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) { } static inline struct mem_cgroup *mem_cgroup_get_oom_group( struct task_struct *victim, struct mem_cgroup *oom_domain) { return NULL; } static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) { } static inline void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int nr) { } static inline void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int nr) { } static inline void mod_memcg_page_state(struct page *page, enum memcg_stat_item idx, int val) { } static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { return 0; } static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { return node_page_state(lruvec_pgdat(lruvec), idx); } static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx) { return node_page_state(lruvec_pgdat(lruvec), idx); } static inline void mem_cgroup_flush_stats(struct mem_cgroup *memcg) { } static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg) { } static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { struct page *page = virt_to_head_page(p); __mod_node_page_state(page_pgdat(page), idx, val); } static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { struct page *page = virt_to_head_page(p); mod_node_page_state(page_pgdat(page), idx, val); } static inline void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { } static inline void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { } static inline void count_memcg_folio_events(struct folio *folio, enum vm_event_item idx, unsigned long nr) { } static inline void count_memcg_events_mm(struct mm_struct *mm, enum vm_event_item idx, unsigned long count) { } static inline void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } static inline void split_page_memcg(struct page *head, int old_order, int new_order) { } #endif /* CONFIG_MEMCG */ /* * Extended information for slab objects stored as an array in page->memcg_data * if MEMCG_DATA_OBJEXTS is set. 
*/ struct slabobj_ext { #ifdef CONFIG_MEMCG struct obj_cgroup *objcg; #endif #ifdef CONFIG_MEM_ALLOC_PROFILING union codetag_ref ref; #endif } __aligned(8); static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) { __mod_lruvec_kmem_state(p, idx, 1); } static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx) { __mod_lruvec_kmem_state(p, idx, -1); } static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) { struct mem_cgroup *memcg; memcg = lruvec_memcg(lruvec); if (!memcg) return NULL; memcg = parent_mem_cgroup(memcg); if (!memcg) return NULL; return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec)); } static inline void unlock_page_lruvec(struct lruvec *lruvec) { spin_unlock(&lruvec->lru_lock); } static inline void unlock_page_lruvec_irq(struct lruvec *lruvec) { spin_unlock_irq(&lruvec->lru_lock); } static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec, unsigned long flags) { spin_unlock_irqrestore(&lruvec->lru_lock, flags); } /* Test requires a stable folio->memcg binding, see folio_memcg() */ static inline bool folio_matches_lruvec(struct folio *folio, struct lruvec *lruvec) { return lruvec_pgdat(lruvec) == folio_pgdat(folio) && lruvec_memcg(lruvec) == folio_memcg(folio); } /* Don't lock again iff page's lruvec locked */ static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio, struct lruvec *locked_lruvec) { if (locked_lruvec) { if (folio_matches_lruvec(folio, locked_lruvec)) return locked_lruvec; unlock_page_lruvec_irq(locked_lruvec); } return folio_lruvec_lock_irq(folio); } /* Don't lock again iff folio's lruvec locked */ static inline void folio_lruvec_relock_irqsave(struct folio *folio, struct lruvec **lruvecp, unsigned long *flags) { if (*lruvecp) { if (folio_matches_lruvec(folio, *lruvecp)) return; unlock_page_lruvec_irqrestore(*lruvecp, *flags); } *lruvecp = folio_lruvec_lock_irqsave(folio, flags); } #ifdef CONFIG_CGROUP_WRITEBACK struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, unsigned long *pheadroom, unsigned long *pdirty, unsigned long *pwriteback); void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, struct bdi_writeback *wb); static inline void mem_cgroup_track_foreign_dirty(struct folio *folio, struct bdi_writeback *wb) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; memcg = folio_memcg(folio); if (unlikely(memcg && &memcg->css != wb->memcg_css)) mem_cgroup_track_foreign_dirty_slowpath(folio, wb); } void mem_cgroup_flush_foreign(struct bdi_writeback *wb); #else /* CONFIG_CGROUP_WRITEBACK */ static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) { return NULL; } static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, unsigned long *pheadroom, unsigned long *pdirty, unsigned long *pwriteback) { } static inline void mem_cgroup_track_foreign_dirty(struct folio *folio, struct bdi_writeback *wb) { } static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb) { } #endif /* CONFIG_CGROUP_WRITEBACK */ struct sock; bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, gfp_t gfp_mask); void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); #ifdef CONFIG_MEMCG extern struct static_key_false memcg_sockets_enabled_key; #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key) void mem_cgroup_sk_alloc(struct sock *sk); void 
mem_cgroup_sk_free(struct sock *sk); static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { #ifdef CONFIG_MEMCG_V1 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return !!memcg->tcpmem_pressure; #endif /* CONFIG_MEMCG_V1 */ do { if (time_before(jiffies, READ_ONCE(memcg->socket_pressure))) return true; } while ((memcg = parent_mem_cgroup(memcg))); return false; } int alloc_shrinker_info(struct mem_cgroup *memcg); void free_shrinker_info(struct mem_cgroup *memcg); void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); void reparent_shrinker_deferred(struct mem_cgroup *memcg); #else #define mem_cgroup_sockets_enabled 0 static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; static inline void mem_cgroup_sk_free(struct sock *sk) { }; static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { return false; } static inline void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { } #endif #ifdef CONFIG_MEMCG bool mem_cgroup_kmem_disabled(void); int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge_page(struct page *page, int order); /* * The returned objcg pointer is safe to use without additional * protection within a scope. The scope is defined either by * the current task (similar to the "current" global variable) * or by set_active_memcg() pair. * Please, use obj_cgroup_get() to get a reference if the pointer * needs to be used outside of the local scope. */ struct obj_cgroup *current_obj_cgroup(void); struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio); static inline struct obj_cgroup *get_obj_cgroup_from_current(void) { struct obj_cgroup *objcg = current_obj_cgroup(); if (objcg) obj_cgroup_get(objcg); return objcg; } int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size); void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size); extern struct static_key_false memcg_bpf_enabled_key; static inline bool memcg_bpf_enabled(void) { return static_branch_likely(&memcg_bpf_enabled_key); } extern struct static_key_false memcg_kmem_online_key; static inline bool memcg_kmem_online(void) { return static_branch_likely(&memcg_kmem_online_key); } static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { if (memcg_kmem_online()) return __memcg_kmem_charge_page(page, gfp, order); return 0; } static inline void memcg_kmem_uncharge_page(struct page *page, int order) { if (memcg_kmem_online()) __memcg_kmem_uncharge_page(page, order); } /* * A helper for accessing memcg's kmem_id, used for getting * corresponding LRU lists. */ static inline int memcg_kmem_id(struct mem_cgroup *memcg) { return memcg ? 
memcg->kmemcg_id : -1; } struct mem_cgroup *mem_cgroup_from_slab_obj(void *p); static inline void count_objcg_events(struct obj_cgroup *objcg, enum vm_event_item idx, unsigned long count) { struct mem_cgroup *memcg; if (!memcg_kmem_online()) return; rcu_read_lock(); memcg = obj_cgroup_memcg(objcg); count_memcg_events(memcg, idx, count); rcu_read_unlock(); } #else static inline bool mem_cgroup_kmem_disabled(void) { return true; } static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { return 0; } static inline void memcg_kmem_uncharge_page(struct page *page, int order) { } static inline int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { return 0; } static inline void __memcg_kmem_uncharge_page(struct page *page, int order) { } static inline struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio) { return NULL; } static inline bool memcg_bpf_enabled(void) { return false; } static inline bool memcg_kmem_online(void) { return false; } static inline int memcg_kmem_id(struct mem_cgroup *memcg) { return -1; } static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) { return NULL; } static inline void count_objcg_events(struct obj_cgroup *objcg, enum vm_event_item idx, unsigned long count) { } #endif /* CONFIG_MEMCG */ #if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP) bool obj_cgroup_may_zswap(struct obj_cgroup *objcg); void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size); void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size); bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg); #else static inline bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) { return true; } static inline void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size) { } static inline void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) { } static inline bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg) { /* if zswap is disabled, do not block pages going to the swapping device */ return true; } #endif /* Cgroup v1-related declarations */ #ifdef CONFIG_MEMCG_V1 unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); bool mem_cgroup_oom_synchronize(bool wait); static inline bool task_in_memcg_oom(struct task_struct *p) { return p->memcg_in_oom; } static inline void mem_cgroup_enter_user_fault(void) { WARN_ON(current->in_user_fault); current->in_user_fault = 1; } static inline void mem_cgroup_exit_user_fault(void) { WARN_ON(!current->in_user_fault); current->in_user_fault = 0; } #else /* CONFIG_MEMCG_V1 */ static inline unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned) { return 0; } static inline bool task_in_memcg_oom(struct task_struct *p) { return false; } static inline bool mem_cgroup_oom_synchronize(bool wait) { return false; } static inline void mem_cgroup_enter_user_fault(void) { } static inline void mem_cgroup_exit_user_fault(void) { } #endif /* CONFIG_MEMCG_V1 */ #endif /* _LINUX_MEMCONTROL_H */
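The charge/uncharge entry points declared above are easiest to read with a caller in view. Below is a minimal, illustrative sketch (not part of memcontrol.h; the helper name example_charge_new_folio and the "insertion" step are hypothetical) of the pattern the mem_cgroup_charge() kernel-doc describes: charge a freshly allocated folio against the mm's memcg, and undo the charge if the folio cannot be used after all.

#include <linux/memcontrol.h>
#include <linux/mm.h>

/*
 * Illustrative sketch only -- not part of memcontrol.h.
 * example_charge_new_folio() is a hypothetical helper showing the usual
 * pairing of mem_cgroup_charge() and mem_cgroup_uncharge() around a
 * newly allocated folio.
 */
static int example_charge_new_folio(struct folio *folio, struct mm_struct *mm,
                                    gfp_t gfp, bool insertion_ok)
{
        int err;

        /* Charge to @mm's memcg (or to the active memcg when @mm is NULL). */
        err = mem_cgroup_charge(folio, mm, gfp);
        if (err)
                return err;     /* typically -ENOMEM at the memory limit */

        /*
         * The folio would now be added to the page cache or mapped.
         * If that later step fails, the charge must be reversed explicitly.
         */
        if (!insertion_ok) {
                mem_cgroup_uncharge(folio);
                return -ENOMEM;
        }

        return 0;
}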
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Portions of this file
 * Copyright(c) 2016-2017 Intel Deutschland GmbH
 * Copyright (C) 2018, 2021-2024 Intel Corporation
 */
#ifndef __CFG80211_RDEV_OPS
#define __CFG80211_RDEV_OPS

#include <linux/rtnetlink.h>
#include <net/cfg80211.h>
#include "core.h"
#include "trace.h"

static inline int rdev_suspend(struct cfg80211_registered_device *rdev,
                               struct
cfg80211_wowlan *wowlan) { int ret; trace_rdev_suspend(&rdev->wiphy, wowlan); ret = rdev->ops->suspend(&rdev->wiphy, wowlan); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_resume(struct cfg80211_registered_device *rdev) { int ret; trace_rdev_resume(&rdev->wiphy); ret = rdev->ops->resume(&rdev->wiphy); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_set_wakeup(struct cfg80211_registered_device *rdev, bool enabled) { trace_rdev_set_wakeup(&rdev->wiphy, enabled); rdev->ops->set_wakeup(&rdev->wiphy, enabled); trace_rdev_return_void(&rdev->wiphy); } static inline struct wireless_dev *rdev_add_virtual_intf(struct cfg80211_registered_device *rdev, char *name, unsigned char name_assign_type, enum nl80211_iftype type, struct vif_params *params) { struct wireless_dev *ret; trace_rdev_add_virtual_intf(&rdev->wiphy, name, type); ret = rdev->ops->add_virtual_intf(&rdev->wiphy, name, name_assign_type, type, params); trace_rdev_return_wdev(&rdev->wiphy, ret); return ret; } static inline int rdev_del_virtual_intf(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { int ret; trace_rdev_del_virtual_intf(&rdev->wiphy, wdev); ret = rdev->ops->del_virtual_intf(&rdev->wiphy, wdev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_change_virtual_intf(struct cfg80211_registered_device *rdev, struct net_device *dev, enum nl80211_iftype type, struct vif_params *params) { int ret; trace_rdev_change_virtual_intf(&rdev->wiphy, dev, type); ret = rdev->ops->change_virtual_intf(&rdev->wiphy, dev, type, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_add_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, struct key_params *params) { int ret; trace_rdev_add_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, mac_addr, params->mode); ret = rdev->ops->add_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, mac_addr, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, void *cookie, void (*callback)(void *cookie, struct key_params*)) { int ret; trace_rdev_get_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, mac_addr); ret = rdev->ops->get_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, mac_addr, cookie, callback); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr) { int ret; trace_rdev_del_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, mac_addr); ret = rdev->ops->del_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, mac_addr); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_default_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, int link_id, u8 key_index, bool unicast, bool multicast) { int ret; trace_rdev_set_default_key(&rdev->wiphy, netdev, link_id, key_index, unicast, multicast); ret = rdev->ops->set_default_key(&rdev->wiphy, netdev, link_id, key_index, unicast, multicast); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_default_mgmt_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, int link_id, u8 
key_index) { int ret; trace_rdev_set_default_mgmt_key(&rdev->wiphy, netdev, link_id, key_index); ret = rdev->ops->set_default_mgmt_key(&rdev->wiphy, netdev, link_id, key_index); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_default_beacon_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, int link_id, u8 key_index) { int ret; trace_rdev_set_default_beacon_key(&rdev->wiphy, netdev, link_id, key_index); ret = rdev->ops->set_default_beacon_key(&rdev->wiphy, netdev, link_id, key_index); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_start_ap(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ap_settings *settings) { int ret; trace_rdev_start_ap(&rdev->wiphy, dev, settings); ret = rdev->ops->start_ap(&rdev->wiphy, dev, settings); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_change_beacon(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ap_update *info) { int ret; trace_rdev_change_beacon(&rdev->wiphy, dev, info); ret = rdev->ops->change_beacon(&rdev->wiphy, dev, info); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_stop_ap(struct cfg80211_registered_device *rdev, struct net_device *dev, unsigned int link_id) { int ret; trace_rdev_stop_ap(&rdev->wiphy, dev, link_id); ret = rdev->ops->stop_ap(&rdev->wiphy, dev, link_id); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_add_station(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *mac, struct station_parameters *params) { int ret; trace_rdev_add_station(&rdev->wiphy, dev, mac, params); ret = rdev->ops->add_station(&rdev->wiphy, dev, mac, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_station(struct cfg80211_registered_device *rdev, struct net_device *dev, struct station_del_parameters *params) { int ret; trace_rdev_del_station(&rdev->wiphy, dev, params); ret = rdev->ops->del_station(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_change_station(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *mac, struct station_parameters *params) { int ret; trace_rdev_change_station(&rdev->wiphy, dev, mac, params); ret = rdev->ops->change_station(&rdev->wiphy, dev, mac, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_station(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *mac, struct station_info *sinfo) { int ret; trace_rdev_get_station(&rdev->wiphy, dev, mac); ret = rdev->ops->get_station(&rdev->wiphy, dev, mac, sinfo); trace_rdev_return_int_station_info(&rdev->wiphy, ret, sinfo); return ret; } static inline int rdev_dump_station(struct cfg80211_registered_device *rdev, struct net_device *dev, int idx, u8 *mac, struct station_info *sinfo) { int ret; trace_rdev_dump_station(&rdev->wiphy, dev, idx, mac); ret = rdev->ops->dump_station(&rdev->wiphy, dev, idx, mac, sinfo); trace_rdev_return_int_station_info(&rdev->wiphy, ret, sinfo); return ret; } static inline int rdev_add_mpath(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *dst, u8 *next_hop) { int ret; trace_rdev_add_mpath(&rdev->wiphy, dev, dst, next_hop); ret = rdev->ops->add_mpath(&rdev->wiphy, dev, dst, next_hop); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_mpath(struct 
cfg80211_registered_device *rdev, struct net_device *dev, u8 *dst) { int ret; trace_rdev_del_mpath(&rdev->wiphy, dev, dst); ret = rdev->ops->del_mpath(&rdev->wiphy, dev, dst); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_change_mpath(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *dst, u8 *next_hop) { int ret; trace_rdev_change_mpath(&rdev->wiphy, dev, dst, next_hop); ret = rdev->ops->change_mpath(&rdev->wiphy, dev, dst, next_hop); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_mpath(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *dst, u8 *next_hop, struct mpath_info *pinfo) { int ret; trace_rdev_get_mpath(&rdev->wiphy, dev, dst, next_hop); ret = rdev->ops->get_mpath(&rdev->wiphy, dev, dst, next_hop, pinfo); trace_rdev_return_int_mpath_info(&rdev->wiphy, ret, pinfo); return ret; } static inline int rdev_get_mpp(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *dst, u8 *mpp, struct mpath_info *pinfo) { int ret; trace_rdev_get_mpp(&rdev->wiphy, dev, dst, mpp); ret = rdev->ops->get_mpp(&rdev->wiphy, dev, dst, mpp, pinfo); trace_rdev_return_int_mpath_info(&rdev->wiphy, ret, pinfo); return ret; } static inline int rdev_dump_mpath(struct cfg80211_registered_device *rdev, struct net_device *dev, int idx, u8 *dst, u8 *next_hop, struct mpath_info *pinfo) { int ret; trace_rdev_dump_mpath(&rdev->wiphy, dev, idx, dst, next_hop); ret = rdev->ops->dump_mpath(&rdev->wiphy, dev, idx, dst, next_hop, pinfo); trace_rdev_return_int_mpath_info(&rdev->wiphy, ret, pinfo); return ret; } static inline int rdev_dump_mpp(struct cfg80211_registered_device *rdev, struct net_device *dev, int idx, u8 *dst, u8 *mpp, struct mpath_info *pinfo) { int ret; trace_rdev_dump_mpp(&rdev->wiphy, dev, idx, dst, mpp); ret = rdev->ops->dump_mpp(&rdev->wiphy, dev, idx, dst, mpp, pinfo); trace_rdev_return_int_mpath_info(&rdev->wiphy, ret, pinfo); return ret; } static inline int rdev_get_mesh_config(struct cfg80211_registered_device *rdev, struct net_device *dev, struct mesh_config *conf) { int ret; trace_rdev_get_mesh_config(&rdev->wiphy, dev); ret = rdev->ops->get_mesh_config(&rdev->wiphy, dev, conf); trace_rdev_return_int_mesh_config(&rdev->wiphy, ret, conf); return ret; } static inline int rdev_update_mesh_config(struct cfg80211_registered_device *rdev, struct net_device *dev, u32 mask, const struct mesh_config *nconf) { int ret; trace_rdev_update_mesh_config(&rdev->wiphy, dev, mask, nconf); ret = rdev->ops->update_mesh_config(&rdev->wiphy, dev, mask, nconf); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_join_mesh(struct cfg80211_registered_device *rdev, struct net_device *dev, const struct mesh_config *conf, const struct mesh_setup *setup) { int ret; trace_rdev_join_mesh(&rdev->wiphy, dev, conf, setup); ret = rdev->ops->join_mesh(&rdev->wiphy, dev, conf, setup); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_leave_mesh(struct cfg80211_registered_device *rdev, struct net_device *dev) { int ret; trace_rdev_leave_mesh(&rdev->wiphy, dev); ret = rdev->ops->leave_mesh(&rdev->wiphy, dev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_join_ocb(struct cfg80211_registered_device *rdev, struct net_device *dev, struct ocb_setup *setup) { int ret; trace_rdev_join_ocb(&rdev->wiphy, dev, setup); ret = rdev->ops->join_ocb(&rdev->wiphy, dev, setup); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int 
rdev_leave_ocb(struct cfg80211_registered_device *rdev, struct net_device *dev) { int ret; trace_rdev_leave_ocb(&rdev->wiphy, dev); ret = rdev->ops->leave_ocb(&rdev->wiphy, dev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_change_bss(struct cfg80211_registered_device *rdev, struct net_device *dev, struct bss_parameters *params) { int ret; trace_rdev_change_bss(&rdev->wiphy, dev, params); ret = rdev->ops->change_bss(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_inform_bss(struct cfg80211_registered_device *rdev, struct cfg80211_bss *bss, const struct cfg80211_bss_ies *ies, void *drv_data) { trace_rdev_inform_bss(&rdev->wiphy, bss); if (rdev->ops->inform_bss) rdev->ops->inform_bss(&rdev->wiphy, bss, ies, drv_data); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_set_txq_params(struct cfg80211_registered_device *rdev, struct net_device *dev, struct ieee80211_txq_params *params) { int ret; trace_rdev_set_txq_params(&rdev->wiphy, dev, params); ret = rdev->ops->set_txq_params(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_libertas_set_mesh_channel(struct cfg80211_registered_device *rdev, struct net_device *dev, struct ieee80211_channel *chan) { int ret; trace_rdev_libertas_set_mesh_channel(&rdev->wiphy, dev, chan); ret = rdev->ops->libertas_set_mesh_channel(&rdev->wiphy, dev, chan); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_monitor_channel(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_chan_def *chandef) { int ret; trace_rdev_set_monitor_channel(&rdev->wiphy, dev, chandef); ret = rdev->ops->set_monitor_channel(&rdev->wiphy, dev, chandef); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_scan(struct cfg80211_registered_device *rdev, struct cfg80211_scan_request *request) { int ret; if (WARN_ON_ONCE(!request->n_ssids && request->ssids)) return -EINVAL; trace_rdev_scan(&rdev->wiphy, request); ret = rdev->ops->scan(&rdev->wiphy, request); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_abort_scan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { trace_rdev_abort_scan(&rdev->wiphy, wdev); rdev->ops->abort_scan(&rdev->wiphy, wdev); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_auth(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_auth_request *req) { int ret; trace_rdev_auth(&rdev->wiphy, dev, req); ret = rdev->ops->auth(&rdev->wiphy, dev, req); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_assoc(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_assoc_request *req) { int ret; trace_rdev_assoc(&rdev->wiphy, dev, req); ret = rdev->ops->assoc(&rdev->wiphy, dev, req); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_deauth(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_deauth_request *req) { int ret; trace_rdev_deauth(&rdev->wiphy, dev, req); ret = rdev->ops->deauth(&rdev->wiphy, dev, req); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_disassoc(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_disassoc_request *req) { int ret; trace_rdev_disassoc(&rdev->wiphy, dev, req); ret = rdev->ops->disassoc(&rdev->wiphy, dev, req); 
trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_connect(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_connect_params *sme) { int ret; trace_rdev_connect(&rdev->wiphy, dev, sme); ret = rdev->ops->connect(&rdev->wiphy, dev, sme); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_update_connect_params(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_connect_params *sme, u32 changed) { int ret; trace_rdev_update_connect_params(&rdev->wiphy, dev, sme, changed); ret = rdev->ops->update_connect_params(&rdev->wiphy, dev, sme, changed); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_disconnect(struct cfg80211_registered_device *rdev, struct net_device *dev, u16 reason_code) { int ret; trace_rdev_disconnect(&rdev->wiphy, dev, reason_code); ret = rdev->ops->disconnect(&rdev->wiphy, dev, reason_code); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_join_ibss(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ibss_params *params) { int ret; trace_rdev_join_ibss(&rdev->wiphy, dev, params); ret = rdev->ops->join_ibss(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_leave_ibss(struct cfg80211_registered_device *rdev, struct net_device *dev) { int ret; trace_rdev_leave_ibss(&rdev->wiphy, dev); ret = rdev->ops->leave_ibss(&rdev->wiphy, dev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_wiphy_params(struct cfg80211_registered_device *rdev, u32 changed) { int ret = -EOPNOTSUPP; trace_rdev_set_wiphy_params(&rdev->wiphy, changed); if (rdev->ops->set_wiphy_params) ret = rdev->ops->set_wiphy_params(&rdev->wiphy, changed); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_tx_power(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, enum nl80211_tx_power_setting type, int mbm) { int ret; trace_rdev_set_tx_power(&rdev->wiphy, wdev, type, mbm); ret = rdev->ops->set_tx_power(&rdev->wiphy, wdev, type, mbm); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_tx_power(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, int *dbm) { int ret; trace_rdev_get_tx_power(&rdev->wiphy, wdev); ret = rdev->ops->get_tx_power(&rdev->wiphy, wdev, dbm); trace_rdev_return_int_int(&rdev->wiphy, ret, *dbm); return ret; } static inline int rdev_set_multicast_to_unicast(struct cfg80211_registered_device *rdev, struct net_device *dev, const bool enabled) { int ret; trace_rdev_set_multicast_to_unicast(&rdev->wiphy, dev, enabled); ret = rdev->ops->set_multicast_to_unicast(&rdev->wiphy, dev, enabled); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_txq_stats(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_txq_stats *txqstats) { int ret; trace_rdev_get_txq_stats(&rdev->wiphy, wdev); ret = rdev->ops->get_txq_stats(&rdev->wiphy, wdev, txqstats); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_rfkill_poll(struct cfg80211_registered_device *rdev) { trace_rdev_rfkill_poll(&rdev->wiphy); rdev->ops->rfkill_poll(&rdev->wiphy); trace_rdev_return_void(&rdev->wiphy); } #ifdef CONFIG_NL80211_TESTMODE static inline int rdev_testmode_cmd(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, void *data, int len) { int ret; 
trace_rdev_testmode_cmd(&rdev->wiphy, wdev); ret = rdev->ops->testmode_cmd(&rdev->wiphy, wdev, data, len); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_testmode_dump(struct cfg80211_registered_device *rdev, struct sk_buff *skb, struct netlink_callback *cb, void *data, int len) { int ret; trace_rdev_testmode_dump(&rdev->wiphy); ret = rdev->ops->testmode_dump(&rdev->wiphy, skb, cb, data, len); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } #endif static inline int rdev_set_bitrate_mask(struct cfg80211_registered_device *rdev, struct net_device *dev, unsigned int link_id, const u8 *peer, const struct cfg80211_bitrate_mask *mask) { int ret; trace_rdev_set_bitrate_mask(&rdev->wiphy, dev, link_id, peer, mask); ret = rdev->ops->set_bitrate_mask(&rdev->wiphy, dev, link_id, peer, mask); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_dump_survey(struct cfg80211_registered_device *rdev, struct net_device *netdev, int idx, struct survey_info *info) { int ret; trace_rdev_dump_survey(&rdev->wiphy, netdev, idx); ret = rdev->ops->dump_survey(&rdev->wiphy, netdev, idx, info); if (ret < 0) trace_rdev_return_int(&rdev->wiphy, ret); else trace_rdev_return_int_survey_info(&rdev->wiphy, ret, info); return ret; } static inline int rdev_set_pmksa(struct cfg80211_registered_device *rdev, struct net_device *netdev, struct cfg80211_pmksa *pmksa) { int ret; trace_rdev_set_pmksa(&rdev->wiphy, netdev, pmksa); ret = rdev->ops->set_pmksa(&rdev->wiphy, netdev, pmksa); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_pmksa(struct cfg80211_registered_device *rdev, struct net_device *netdev, struct cfg80211_pmksa *pmksa) { int ret; trace_rdev_del_pmksa(&rdev->wiphy, netdev, pmksa); ret = rdev->ops->del_pmksa(&rdev->wiphy, netdev, pmksa); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_flush_pmksa(struct cfg80211_registered_device *rdev, struct net_device *netdev) { int ret; trace_rdev_flush_pmksa(&rdev->wiphy, netdev); ret = rdev->ops->flush_pmksa(&rdev->wiphy, netdev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_remain_on_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct ieee80211_channel *chan, unsigned int duration, u64 *cookie) { int ret; trace_rdev_remain_on_channel(&rdev->wiphy, wdev, chan, duration); ret = rdev->ops->remain_on_channel(&rdev->wiphy, wdev, chan, duration, cookie); trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie); return ret; } static inline int rdev_cancel_remain_on_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u64 cookie) { int ret; trace_rdev_cancel_remain_on_channel(&rdev->wiphy, wdev, cookie); ret = rdev->ops->cancel_remain_on_channel(&rdev->wiphy, wdev, cookie); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_mgmt_tx(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_mgmt_tx_params *params, u64 *cookie) { int ret; trace_rdev_mgmt_tx(&rdev->wiphy, wdev, params); ret = rdev->ops->mgmt_tx(&rdev->wiphy, wdev, params, cookie); trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie); return ret; } static inline int rdev_tx_control_port(struct cfg80211_registered_device *rdev, struct net_device *dev, const void *buf, size_t len, const u8 *dest, __be16 proto, const bool noencrypt, int link, u64 *cookie) { int ret; trace_rdev_tx_control_port(&rdev->wiphy, dev, buf, len, dest, proto, noencrypt, link); 
ret = rdev->ops->tx_control_port(&rdev->wiphy, dev, buf, len, dest, proto, noencrypt, link, cookie); if (cookie) trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie); else trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_mgmt_tx_cancel_wait(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u64 cookie) { int ret; trace_rdev_mgmt_tx_cancel_wait(&rdev->wiphy, wdev, cookie); ret = rdev->ops->mgmt_tx_cancel_wait(&rdev->wiphy, wdev, cookie); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_power_mgmt(struct cfg80211_registered_device *rdev, struct net_device *dev, bool enabled, int timeout) { int ret; trace_rdev_set_power_mgmt(&rdev->wiphy, dev, enabled, timeout); ret = rdev->ops->set_power_mgmt(&rdev->wiphy, dev, enabled, timeout); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_cqm_rssi_config(struct cfg80211_registered_device *rdev, struct net_device *dev, s32 rssi_thold, u32 rssi_hyst) { int ret; trace_rdev_set_cqm_rssi_config(&rdev->wiphy, dev, rssi_thold, rssi_hyst); ret = rdev->ops->set_cqm_rssi_config(&rdev->wiphy, dev, rssi_thold, rssi_hyst); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_cqm_rssi_range_config(struct cfg80211_registered_device *rdev, struct net_device *dev, s32 low, s32 high) { int ret; trace_rdev_set_cqm_rssi_range_config(&rdev->wiphy, dev, low, high); ret = rdev->ops->set_cqm_rssi_range_config(&rdev->wiphy, dev, low, high); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_cqm_txe_config(struct cfg80211_registered_device *rdev, struct net_device *dev, u32 rate, u32 pkts, u32 intvl) { int ret; trace_rdev_set_cqm_txe_config(&rdev->wiphy, dev, rate, pkts, intvl); ret = rdev->ops->set_cqm_txe_config(&rdev->wiphy, dev, rate, pkts, intvl); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_update_mgmt_frame_registrations(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct mgmt_frame_regs *upd) { might_sleep(); trace_rdev_update_mgmt_frame_registrations(&rdev->wiphy, wdev, upd); if (rdev->ops->update_mgmt_frame_registrations) rdev->ops->update_mgmt_frame_registrations(&rdev->wiphy, wdev, upd); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_set_antenna(struct cfg80211_registered_device *rdev, u32 tx_ant, u32 rx_ant) { int ret; trace_rdev_set_antenna(&rdev->wiphy, tx_ant, rx_ant); ret = rdev->ops->set_antenna(&rdev->wiphy, tx_ant, rx_ant); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_antenna(struct cfg80211_registered_device *rdev, u32 *tx_ant, u32 *rx_ant) { int ret; trace_rdev_get_antenna(&rdev->wiphy); ret = rdev->ops->get_antenna(&rdev->wiphy, tx_ant, rx_ant); if (ret) trace_rdev_return_int(&rdev->wiphy, ret); else trace_rdev_return_int_tx_rx(&rdev->wiphy, ret, *tx_ant, *rx_ant); return ret; } static inline int rdev_sched_scan_start(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_sched_scan_request *request) { int ret; trace_rdev_sched_scan_start(&rdev->wiphy, dev, request->reqid); ret = rdev->ops->sched_scan_start(&rdev->wiphy, dev, request); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_sched_scan_stop(struct cfg80211_registered_device *rdev, struct net_device *dev, u64 reqid) { int ret; trace_rdev_sched_scan_stop(&rdev->wiphy, dev, reqid); ret = rdev->ops->sched_scan_stop(&rdev->wiphy, dev, reqid); 
trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_rekey_data(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_gtk_rekey_data *data) { int ret; trace_rdev_set_rekey_data(&rdev->wiphy, dev); ret = rdev->ops->set_rekey_data(&rdev->wiphy, dev, data); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_tdls_mgmt(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *peer, int link_id, u8 action_code, u8 dialog_token, u16 status_code, u32 peer_capability, bool initiator, const u8 *buf, size_t len) { int ret; trace_rdev_tdls_mgmt(&rdev->wiphy, dev, peer, link_id, action_code, dialog_token, status_code, peer_capability, initiator, buf, len); ret = rdev->ops->tdls_mgmt(&rdev->wiphy, dev, peer, link_id, action_code, dialog_token, status_code, peer_capability, initiator, buf, len); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_tdls_oper(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *peer, enum nl80211_tdls_operation oper) { int ret; trace_rdev_tdls_oper(&rdev->wiphy, dev, peer, oper); ret = rdev->ops->tdls_oper(&rdev->wiphy, dev, peer, oper); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_probe_client(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *peer, u64 *cookie) { int ret; trace_rdev_probe_client(&rdev->wiphy, dev, peer); ret = rdev->ops->probe_client(&rdev->wiphy, dev, peer, cookie); trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie); return ret; } static inline int rdev_set_noack_map(struct cfg80211_registered_device *rdev, struct net_device *dev, u16 noack_map) { int ret; trace_rdev_set_noack_map(&rdev->wiphy, dev, noack_map); ret = rdev->ops->set_noack_map(&rdev->wiphy, dev, noack_map); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, unsigned int link_id, struct cfg80211_chan_def *chandef) { int ret; trace_rdev_get_channel(&rdev->wiphy, wdev, link_id); ret = rdev->ops->get_channel(&rdev->wiphy, wdev, link_id, chandef); trace_rdev_return_chandef(&rdev->wiphy, ret, chandef); return ret; } static inline int rdev_start_p2p_device(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { int ret; trace_rdev_start_p2p_device(&rdev->wiphy, wdev); ret = rdev->ops->start_p2p_device(&rdev->wiphy, wdev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_stop_p2p_device(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { trace_rdev_stop_p2p_device(&rdev->wiphy, wdev); rdev->ops->stop_p2p_device(&rdev->wiphy, wdev); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_start_nan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_nan_conf *conf) { int ret; trace_rdev_start_nan(&rdev->wiphy, wdev, conf); ret = rdev->ops->start_nan(&rdev->wiphy, wdev, conf); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_stop_nan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { trace_rdev_stop_nan(&rdev->wiphy, wdev); rdev->ops->stop_nan(&rdev->wiphy, wdev); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_add_nan_func(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_nan_func *nan_func) { int ret; trace_rdev_add_nan_func(&rdev->wiphy, wdev, nan_func); ret = 
rdev->ops->add_nan_func(&rdev->wiphy, wdev, nan_func); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_del_nan_func(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u64 cookie) { trace_rdev_del_nan_func(&rdev->wiphy, wdev, cookie); rdev->ops->del_nan_func(&rdev->wiphy, wdev, cookie); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_nan_change_conf(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_nan_conf *conf, u32 changes) { int ret; trace_rdev_nan_change_conf(&rdev->wiphy, wdev, conf, changes); if (rdev->ops->nan_change_conf) ret = rdev->ops->nan_change_conf(&rdev->wiphy, wdev, conf, changes); else ret = -EOPNOTSUPP; trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_mac_acl(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_acl_data *params) { int ret; trace_rdev_set_mac_acl(&rdev->wiphy, dev, params); ret = rdev->ops->set_mac_acl(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_update_ft_ies(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_update_ft_ies_params *ftie) { int ret; trace_rdev_update_ft_ies(&rdev->wiphy, dev, ftie); ret = rdev->ops->update_ft_ies(&rdev->wiphy, dev, ftie); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_crit_proto_start(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, enum nl80211_crit_proto_id protocol, u16 duration) { int ret; trace_rdev_crit_proto_start(&rdev->wiphy, wdev, protocol, duration); ret = rdev->ops->crit_proto_start(&rdev->wiphy, wdev, protocol, duration); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_crit_proto_stop(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { trace_rdev_crit_proto_stop(&rdev->wiphy, wdev); rdev->ops->crit_proto_stop(&rdev->wiphy, wdev); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_channel_switch(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_csa_settings *params) { int ret; trace_rdev_channel_switch(&rdev->wiphy, dev, params); ret = rdev->ops->channel_switch(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_qos_map(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_qos_map *qos_map) { int ret = -EOPNOTSUPP; if (rdev->ops->set_qos_map) { trace_rdev_set_qos_map(&rdev->wiphy, dev, qos_map); ret = rdev->ops->set_qos_map(&rdev->wiphy, dev, qos_map); trace_rdev_return_int(&rdev->wiphy, ret); } return ret; } static inline int rdev_set_ap_chanwidth(struct cfg80211_registered_device *rdev, struct net_device *dev, unsigned int link_id, struct cfg80211_chan_def *chandef) { int ret; trace_rdev_set_ap_chanwidth(&rdev->wiphy, dev, link_id, chandef); ret = rdev->ops->set_ap_chanwidth(&rdev->wiphy, dev, link_id, chandef); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_add_tx_ts(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 tsid, const u8 *peer, u8 user_prio, u16 admitted_time) { int ret = -EOPNOTSUPP; trace_rdev_add_tx_ts(&rdev->wiphy, dev, tsid, peer, user_prio, admitted_time); if (rdev->ops->add_tx_ts) ret = rdev->ops->add_tx_ts(&rdev->wiphy, dev, tsid, peer, user_prio, admitted_time); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_tx_ts(struct 
cfg80211_registered_device *rdev, struct net_device *dev, u8 tsid, const u8 *peer) { int ret = -EOPNOTSUPP; trace_rdev_del_tx_ts(&rdev->wiphy, dev, tsid, peer); if (rdev->ops->del_tx_ts) ret = rdev->ops->del_tx_ts(&rdev->wiphy, dev, tsid, peer); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_tdls_channel_switch(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *addr, u8 oper_class, struct cfg80211_chan_def *chandef) { int ret; trace_rdev_tdls_channel_switch(&rdev->wiphy, dev, addr, oper_class, chandef); ret = rdev->ops->tdls_channel_switch(&rdev->wiphy, dev, addr, oper_class, chandef); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_tdls_cancel_channel_switch(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *addr) { trace_rdev_tdls_cancel_channel_switch(&rdev->wiphy, dev, addr); rdev->ops->tdls_cancel_channel_switch(&rdev->wiphy, dev, addr); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_start_radar_detection(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_chan_def *chandef, u32 cac_time_ms, int link_id) { int ret = -EOPNOTSUPP; trace_rdev_start_radar_detection(&rdev->wiphy, dev, chandef, cac_time_ms, link_id); if (rdev->ops->start_radar_detection) ret = rdev->ops->start_radar_detection(&rdev->wiphy, dev, chandef, cac_time_ms, link_id); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_end_cac(struct cfg80211_registered_device *rdev, struct net_device *dev, unsigned int link_id) { trace_rdev_end_cac(&rdev->wiphy, dev, link_id); if (rdev->ops->end_cac) rdev->ops->end_cac(&rdev->wiphy, dev, link_id); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_set_mcast_rate(struct cfg80211_registered_device *rdev, struct net_device *dev, int mcast_rate[NUM_NL80211_BANDS]) { int ret = -EOPNOTSUPP; trace_rdev_set_mcast_rate(&rdev->wiphy, dev, mcast_rate); if (rdev->ops->set_mcast_rate) ret = rdev->ops->set_mcast_rate(&rdev->wiphy, dev, mcast_rate); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_coalesce(struct cfg80211_registered_device *rdev, struct cfg80211_coalesce *coalesce) { int ret = -EOPNOTSUPP; trace_rdev_set_coalesce(&rdev->wiphy, coalesce); if (rdev->ops->set_coalesce) ret = rdev->ops->set_coalesce(&rdev->wiphy, coalesce); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_pmk(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_pmk_conf *pmk_conf) { int ret = -EOPNOTSUPP; trace_rdev_set_pmk(&rdev->wiphy, dev, pmk_conf); if (rdev->ops->set_pmk) ret = rdev->ops->set_pmk(&rdev->wiphy, dev, pmk_conf); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_pmk(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *aa) { int ret = -EOPNOTSUPP; trace_rdev_del_pmk(&rdev->wiphy, dev, aa); if (rdev->ops->del_pmk) ret = rdev->ops->del_pmk(&rdev->wiphy, dev, aa); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_external_auth(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_external_auth_params *params) { int ret = -EOPNOTSUPP; trace_rdev_external_auth(&rdev->wiphy, dev, params); if (rdev->ops->external_auth) ret = rdev->ops->external_auth(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_ftm_responder_stats(struct cfg80211_registered_device 
*rdev, struct net_device *dev, struct cfg80211_ftm_responder_stats *ftm_stats) { int ret = -EOPNOTSUPP; trace_rdev_get_ftm_responder_stats(&rdev->wiphy, dev, ftm_stats); if (rdev->ops->get_ftm_responder_stats) ret = rdev->ops->get_ftm_responder_stats(&rdev->wiphy, dev, ftm_stats); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_start_pmsr(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_pmsr_request *request) { int ret = -EOPNOTSUPP; trace_rdev_start_pmsr(&rdev->wiphy, wdev, request->cookie); if (rdev->ops->start_pmsr) ret = rdev->ops->start_pmsr(&rdev->wiphy, wdev, request); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_abort_pmsr(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_pmsr_request *request) { trace_rdev_abort_pmsr(&rdev->wiphy, wdev, request->cookie); if (rdev->ops->abort_pmsr) rdev->ops->abort_pmsr(&rdev->wiphy, wdev, request); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_update_owe_info(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_update_owe_info *oweinfo) { int ret = -EOPNOTSUPP; trace_rdev_update_owe_info(&rdev->wiphy, dev, oweinfo); if (rdev->ops->update_owe_info) ret = rdev->ops->update_owe_info(&rdev->wiphy, dev, oweinfo); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_probe_mesh_link(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *dest, const void *buf, size_t len) { int ret; trace_rdev_probe_mesh_link(&rdev->wiphy, dev, dest, buf, len); ret = rdev->ops->probe_mesh_link(&rdev->wiphy, dev, buf, len); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_tid_config(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_tid_config *tid_conf) { int ret; trace_rdev_set_tid_config(&rdev->wiphy, dev, tid_conf); ret = rdev->ops->set_tid_config(&rdev->wiphy, dev, tid_conf); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_reset_tid_config(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *peer, u8 tids) { int ret; trace_rdev_reset_tid_config(&rdev->wiphy, dev, peer, tids); ret = rdev->ops->reset_tid_config(&rdev->wiphy, dev, peer, tids); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_sar_specs(struct cfg80211_registered_device *rdev, struct cfg80211_sar_specs *sar) { int ret; trace_rdev_set_sar_specs(&rdev->wiphy, sar); ret = rdev->ops->set_sar_specs(&rdev->wiphy, sar); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_color_change(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_color_change_settings *params) { int ret; trace_rdev_color_change(&rdev->wiphy, dev, params); ret = rdev->ops->color_change(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_fils_aad(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_fils_aad *fils_aad) { int ret = -EOPNOTSUPP; trace_rdev_set_fils_aad(&rdev->wiphy, dev, fils_aad); if (rdev->ops->set_fils_aad) ret = rdev->ops->set_fils_aad(&rdev->wiphy, dev, fils_aad); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_radar_background(struct cfg80211_registered_device *rdev, struct cfg80211_chan_def *chandef) { struct wiphy *wiphy = &rdev->wiphy; int ret = -EOPNOTSUPP; 
trace_rdev_set_radar_background(wiphy, chandef); if (rdev->ops->set_radar_background) ret = rdev->ops->set_radar_background(wiphy, chandef); trace_rdev_return_int(wiphy, ret); return ret; } static inline int rdev_add_intf_link(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, unsigned int link_id) { int ret = 0; trace_rdev_add_intf_link(&rdev->wiphy, wdev, link_id); if (rdev->ops->add_intf_link) ret = rdev->ops->add_intf_link(&rdev->wiphy, wdev, link_id); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_del_intf_link(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, unsigned int link_id) { trace_rdev_del_intf_link(&rdev->wiphy, wdev, link_id); if (rdev->ops->del_intf_link) rdev->ops->del_intf_link(&rdev->wiphy, wdev, link_id); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_add_link_station(struct cfg80211_registered_device *rdev, struct net_device *dev, struct link_station_parameters *params) { int ret = -EOPNOTSUPP; trace_rdev_add_link_station(&rdev->wiphy, dev, params); if (rdev->ops->add_link_station) ret = rdev->ops->add_link_station(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_mod_link_station(struct cfg80211_registered_device *rdev, struct net_device *dev, struct link_station_parameters *params) { int ret = -EOPNOTSUPP; trace_rdev_mod_link_station(&rdev->wiphy, dev, params); if (rdev->ops->mod_link_station) ret = rdev->ops->mod_link_station(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_link_station(struct cfg80211_registered_device *rdev, struct net_device *dev, struct link_station_del_parameters *params) { int ret = -EOPNOTSUPP; trace_rdev_del_link_station(&rdev->wiphy, dev, params); if (rdev->ops->del_link_station) ret = rdev->ops->del_link_station(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_hw_timestamp(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_set_hw_timestamp *hwts) { struct wiphy *wiphy = &rdev->wiphy; int ret = -EOPNOTSUPP; trace_rdev_set_hw_timestamp(wiphy, dev, hwts); if (rdev->ops->set_hw_timestamp) ret = rdev->ops->set_hw_timestamp(wiphy, dev, hwts); trace_rdev_return_int(wiphy, ret); return ret; } static inline int rdev_set_ttlm(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ttlm_params *params) { struct wiphy *wiphy = &rdev->wiphy; int ret = -EOPNOTSUPP; trace_rdev_set_ttlm(wiphy, dev, params); if (rdev->ops->set_ttlm) ret = rdev->ops->set_ttlm(wiphy, dev, params); trace_rdev_return_int(wiphy, ret); return ret; } static inline u32 rdev_get_radio_mask(struct cfg80211_registered_device *rdev, struct net_device *dev) { struct wiphy *wiphy = &rdev->wiphy; if (!rdev->ops->get_radio_mask) return 0; return rdev->ops->get_radio_mask(wiphy, dev); } #endif /* __CFG80211_RDEV_OPS */
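Every wrapper in this header follows the same shape: emit a trace_rdev_* entry event, invoke the driver callback through rdev->ops, record the result with a trace_rdev_return_* event, and hand that result back to the caller; callbacks a driver may leave unimplemented are additionally guarded by a NULL check that falls back to -EOPNOTSUPP. A minimal sketch of that pattern for a hypothetical optional op is shown below (example_op and trace_rdev_example_op do not exist in cfg80211; they only stand in for a real callback and its entry tracepoint):

static inline int
rdev_example_op(struct cfg80211_registered_device *rdev,
		struct net_device *dev)
{
	int ret = -EOPNOTSUPP;	/* default when the driver does not provide the op */

	trace_rdev_example_op(&rdev->wiphy, dev);	/* hypothetical entry tracepoint */
	if (rdev->ops->example_op)			/* optional-op guard */
		ret = rdev->ops->example_op(&rdev->wiphy, dev);
	trace_rdev_return_int(&rdev->wiphy, ret);	/* shared exit tracepoint */
	return ret;
}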
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Bridge multicast support.
 *
 * Copyright (c) 2010 Herbert Xu <herbert@gondor.apana.org.au>
 */

#include <linux/err.h>
#include <linux/export.h>
#include <linux/if_ether.h>
#include <linux/igmp.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/kernel.h>
#include <linux/log2.h>
#include <linux/netdevice.h>
#include <linux/netfilter_bridge.h>
#include <linux/random.h>
#include <linux/rculist.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/inetdevice.h>
#include <linux/mroute.h>
#include <net/ip.h>
#include <net/switchdev.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <linux/icmpv6.h>
#include <net/ipv6.h>
#include <net/mld.h>
#include <net/ip6_checksum.h>
#include <net/addrconf.h>
#endif
#include <trace/events/bridge.h>

#include "br_private.h"
#include "br_private_mcast_eht.h"

static const struct rhashtable_params br_mdb_rht_params = {
	.head_offset = offsetof(struct net_bridge_mdb_entry, rhnode),
	.key_offset = offsetof(struct net_bridge_mdb_entry, addr),
	.key_len = sizeof(struct br_ip),
	.automatic_shrinking = true,
};

static const struct rhashtable_params br_sg_port_rht_params = {
	.head_offset = offsetof(struct net_bridge_port_group, rhnode),
	.key_offset = offsetof(struct net_bridge_port_group, key),
	.key_len = sizeof(struct net_bridge_port_group_sg_key),
	.automatic_shrinking = true,
};

static void br_multicast_start_querier(struct net_bridge_mcast *brmctx,
				       struct bridge_mcast_own_query *query);
static void br_ip4_multicast_add_router(struct net_bridge_mcast *brmctx,
					struct net_bridge_mcast_port *pmctx);
static void br_ip4_multicast_leave_group(struct net_bridge_mcast *brmctx,
					 struct net_bridge_mcast_port *pmctx,
					 __be32 group,
					 __u16 vid,
					 const unsigned char *src);
static void br_multicast_port_group_rexmit(struct timer_list *t);

static void
br_multicast_rport_del_notify(struct net_bridge_mcast_port *pmctx, bool deleted);
static void br_ip6_multicast_add_router(struct net_bridge_mcast *brmctx,
					struct net_bridge_mcast_port *pmctx);
#if IS_ENABLED(CONFIG_IPV6)
static void br_ip6_multicast_leave_group(struct net_bridge_mcast *brmctx,
					 struct net_bridge_mcast_port *pmctx,
					 const struct in6_addr *group,
					 __u16 vid, const unsigned char *src);
#endif
static struct net_bridge_port_group *
__br_multicast_add_group(struct net_bridge_mcast *brmctx,
			 struct net_bridge_mcast_port *pmctx,
			 struct br_ip *group,
			 const unsigned char *src,
			 u8 filter_mode,
			 bool igmpv2_mldv1,
			 bool blocked);
static void br_multicast_find_del_pg(struct net_bridge *br,
				     struct net_bridge_port_group *pg);
static void __br_multicast_stop(struct net_bridge_mcast *brmctx);

static int br_mc_disabled_update(struct net_device *dev, bool value,
				 struct netlink_ext_ack *extack);

static struct net_bridge_port_group *
br_sg_port_find(struct net_bridge *br,
		struct net_bridge_port_group_sg_key *sg_p)
{
	lockdep_assert_held_once(&br->multicast_lock);

	return rhashtable_lookup_fast(&br->sg_port_tbl, sg_p,
				      br_sg_port_rht_params);
}

static struct net_bridge_mdb_entry *br_mdb_ip_get_rcu(struct net_bridge *br,
						      struct br_ip *dst)
{
	return rhashtable_lookup(&br->mdb_hash_tbl, dst, br_mdb_rht_params);
}

struct net_bridge_mdb_entry *br_mdb_ip_get(struct net_bridge *br,
					   struct br_ip *dst)
{
	struct net_bridge_mdb_entry *ent;

	lockdep_assert_held_once(&br->multicast_lock);

	rcu_read_lock();
	ent = rhashtable_lookup(&br->mdb_hash_tbl, dst, br_mdb_rht_params);
	rcu_read_unlock();

	return ent;
}

static struct
net_bridge_mdb_entry *br_mdb_ip4_get(struct net_bridge *br, __be32 dst, __u16 vid) { struct br_ip br_dst; memset(&br_dst, 0, sizeof(br_dst)); br_dst.dst.ip4 = dst; br_dst.proto = htons(ETH_P_IP); br_dst.vid = vid; return br_mdb_ip_get(br, &br_dst); } #if IS_ENABLED(CONFIG_IPV6) static struct net_bridge_mdb_entry *br_mdb_ip6_get(struct net_bridge *br, const struct in6_addr *dst, __u16 vid) { struct br_ip br_dst; memset(&br_dst, 0, sizeof(br_dst)); br_dst.dst.ip6 = *dst; br_dst.proto = htons(ETH_P_IPV6); br_dst.vid = vid; return br_mdb_ip_get(br, &br_dst); } #endif struct net_bridge_mdb_entry * br_mdb_entry_skb_get(struct net_bridge_mcast *brmctx, struct sk_buff *skb, u16 vid) { struct net_bridge *br = brmctx->br; struct br_ip ip; if (!br_opt_get(br, BROPT_MULTICAST_ENABLED) || br_multicast_ctx_vlan_global_disabled(brmctx)) return NULL; if (BR_INPUT_SKB_CB(skb)->igmp) return NULL; memset(&ip, 0, sizeof(ip)); ip.proto = skb->protocol; ip.vid = vid; switch (skb->protocol) { case htons(ETH_P_IP): ip.dst.ip4 = ip_hdr(skb)->daddr; if (brmctx->multicast_igmp_version == 3) { struct net_bridge_mdb_entry *mdb; ip.src.ip4 = ip_hdr(skb)->saddr; mdb = br_mdb_ip_get_rcu(br, &ip); if (mdb) return mdb; ip.src.ip4 = 0; } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): ip.dst.ip6 = ipv6_hdr(skb)->daddr; if (brmctx->multicast_mld_version == 2) { struct net_bridge_mdb_entry *mdb; ip.src.ip6 = ipv6_hdr(skb)->saddr; mdb = br_mdb_ip_get_rcu(br, &ip); if (mdb) return mdb; memset(&ip.src.ip6, 0, sizeof(ip.src.ip6)); } break; #endif default: ip.proto = 0; ether_addr_copy(ip.dst.mac_addr, eth_hdr(skb)->h_dest); } return br_mdb_ip_get_rcu(br, &ip); } /* IMPORTANT: this function must be used only when the contexts cannot be * passed down (e.g. timer) and must be used for read-only purposes because * the vlan snooping option can change, so it can return any context * (non-vlan or vlan). Its initial intended purpose is to read timer values * from the *current* context based on the option. At worst that could lead * to inconsistent timers when the contexts are changed, i.e. src timer * which needs to re-arm with a specific delay taken from the old context */ static struct net_bridge_mcast_port * br_multicast_pg_to_port_ctx(const struct net_bridge_port_group *pg) { struct net_bridge_mcast_port *pmctx = &pg->key.port->multicast_ctx; struct net_bridge_vlan *vlan; lockdep_assert_held_once(&pg->key.port->br->multicast_lock); /* if vlan snooping is disabled use the port's multicast context */ if (!pg->key.addr.vid || !br_opt_get(pg->key.port->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) goto out; /* locking is tricky here, due to different rules for multicast and * vlans we need to take rcu to find the vlan and make sure it has * the BR_VLFLAG_MCAST_ENABLED flag set, it can only change under * multicast_lock which must be already held here, so the vlan's pmctx * can safely be used on return */ rcu_read_lock(); vlan = br_vlan_find(nbp_vlan_group_rcu(pg->key.port), pg->key.addr.vid); if (vlan && !br_multicast_port_ctx_vlan_disabled(&vlan->port_mcast_ctx)) pmctx = &vlan->port_mcast_ctx; else pmctx = NULL; rcu_read_unlock(); out: return pmctx; } static struct net_bridge_mcast_port * br_multicast_port_vid_to_port_ctx(struct net_bridge_port *port, u16 vid) { struct net_bridge_mcast_port *pmctx = NULL; struct net_bridge_vlan *vlan; lockdep_assert_held_once(&port->br->multicast_lock); if (!br_opt_get(port->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) return NULL; /* Take RCU to access the vlan. 
*/ rcu_read_lock(); vlan = br_vlan_find(nbp_vlan_group_rcu(port), vid); if (vlan && !br_multicast_port_ctx_vlan_disabled(&vlan->port_mcast_ctx)) pmctx = &vlan->port_mcast_ctx; rcu_read_unlock(); return pmctx; } /* when snooping we need to check if the contexts should be used * in the following order: * - if pmctx is non-NULL (port), check if it should be used * - if pmctx is NULL (bridge), check if brmctx should be used */ static bool br_multicast_ctx_should_use(const struct net_bridge_mcast *brmctx, const struct net_bridge_mcast_port *pmctx) { if (!netif_running(brmctx->br->dev)) return false; if (pmctx) return !br_multicast_port_ctx_state_disabled(pmctx); else return !br_multicast_ctx_vlan_disabled(brmctx); } static bool br_port_group_equal(struct net_bridge_port_group *p, struct net_bridge_port *port, const unsigned char *src) { if (p->key.port != port) return false; if (!(port->flags & BR_MULTICAST_TO_UNICAST)) return true; return ether_addr_equal(src, p->eth_addr); } static void __fwd_add_star_excl(struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, struct br_ip *sg_ip) { struct net_bridge_port_group_sg_key sg_key; struct net_bridge_port_group *src_pg; struct net_bridge_mcast *brmctx; memset(&sg_key, 0, sizeof(sg_key)); brmctx = br_multicast_port_ctx_get_global(pmctx); sg_key.port = pg->key.port; sg_key.addr = *sg_ip; if (br_sg_port_find(brmctx->br, &sg_key)) return; src_pg = __br_multicast_add_group(brmctx, pmctx, sg_ip, pg->eth_addr, MCAST_INCLUDE, false, false); if (IS_ERR_OR_NULL(src_pg) || src_pg->rt_protocol != RTPROT_KERNEL) return; src_pg->flags |= MDB_PG_FLAGS_STAR_EXCL; } static void __fwd_del_star_excl(struct net_bridge_port_group *pg, struct br_ip *sg_ip) { struct net_bridge_port_group_sg_key sg_key; struct net_bridge *br = pg->key.port->br; struct net_bridge_port_group *src_pg; memset(&sg_key, 0, sizeof(sg_key)); sg_key.port = pg->key.port; sg_key.addr = *sg_ip; src_pg = br_sg_port_find(br, &sg_key); if (!src_pg || !(src_pg->flags & MDB_PG_FLAGS_STAR_EXCL) || src_pg->rt_protocol != RTPROT_KERNEL) return; br_multicast_find_del_pg(br, src_pg); } /* When a port group transitions to (or is added as) EXCLUDE we need to add it * to all other ports' S,G entries which are not blocked by the current group * for proper replication, the assumption is that any S,G blocked entries * are already added so the S,G,port lookup should skip them. * When a port group transitions from EXCLUDE -> INCLUDE mode or is being * deleted we need to remove it from all ports' S,G entries where it was * automatically installed before (i.e. where it's MDB_PG_FLAGS_STAR_EXCL). 
*/ void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg, u8 filter_mode) { struct net_bridge *br = pg->key.port->br; struct net_bridge_port_group *pg_lst; struct net_bridge_mcast_port *pmctx; struct net_bridge_mdb_entry *mp; struct br_ip sg_ip; if (WARN_ON(!br_multicast_is_star_g(&pg->key.addr))) return; mp = br_mdb_ip_get(br, &pg->key.addr); if (!mp) return; pmctx = br_multicast_pg_to_port_ctx(pg); if (!pmctx) return; memset(&sg_ip, 0, sizeof(sg_ip)); sg_ip = pg->key.addr; for (pg_lst = mlock_dereference(mp->ports, br); pg_lst; pg_lst = mlock_dereference(pg_lst->next, br)) { struct net_bridge_group_src *src_ent; if (pg_lst == pg) continue; hlist_for_each_entry(src_ent, &pg_lst->src_list, node) { if (!(src_ent->flags & BR_SGRP_F_INSTALLED)) continue; sg_ip.src = src_ent->addr.src; switch (filter_mode) { case MCAST_INCLUDE: __fwd_del_star_excl(pg, &sg_ip); break; case MCAST_EXCLUDE: __fwd_add_star_excl(pmctx, pg, &sg_ip); break; } } } } /* called when adding a new S,G with host_joined == false by default */ static void br_multicast_sg_host_state(struct net_bridge_mdb_entry *star_mp, struct net_bridge_port_group *sg) { struct net_bridge_mdb_entry *sg_mp; if (WARN_ON(!br_multicast_is_star_g(&star_mp->addr))) return; if (!star_mp->host_joined) return; sg_mp = br_mdb_ip_get(star_mp->br, &sg->key.addr); if (!sg_mp) return; sg_mp->host_joined = true; } /* set the host_joined state of all of *,G's S,G entries */ static void br_multicast_star_g_host_state(struct net_bridge_mdb_entry *star_mp) { struct net_bridge *br = star_mp->br; struct net_bridge_mdb_entry *sg_mp; struct net_bridge_port_group *pg; struct br_ip sg_ip; if (WARN_ON(!br_multicast_is_star_g(&star_mp->addr))) return; memset(&sg_ip, 0, sizeof(sg_ip)); sg_ip = star_mp->addr; for (pg = mlock_dereference(star_mp->ports, br); pg; pg = mlock_dereference(pg->next, br)) { struct net_bridge_group_src *src_ent; hlist_for_each_entry(src_ent, &pg->src_list, node) { if (!(src_ent->flags & BR_SGRP_F_INSTALLED)) continue; sg_ip.src = src_ent->addr.src; sg_mp = br_mdb_ip_get(br, &sg_ip); if (!sg_mp) continue; sg_mp->host_joined = star_mp->host_joined; } } } static void br_multicast_sg_del_exclude_ports(struct net_bridge_mdb_entry *sgmp) { struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; /* *,G exclude ports are only added to S,G entries */ if (WARN_ON(br_multicast_is_star_g(&sgmp->addr))) return; /* we need the STAR_EXCLUDE ports if there are non-STAR_EXCLUDE ports * we should ignore perm entries since they're managed by user-space */ for (pp = &sgmp->ports; (p = mlock_dereference(*pp, sgmp->br)) != NULL; pp = &p->next) if (!(p->flags & (MDB_PG_FLAGS_STAR_EXCL | MDB_PG_FLAGS_PERMANENT))) return; /* currently the host can only have joined the *,G which means * we treat it as EXCLUDE {}, so for an S,G it's considered a * STAR_EXCLUDE entry and we can safely leave it */ sgmp->host_joined = false; for (pp = &sgmp->ports; (p = mlock_dereference(*pp, sgmp->br)) != NULL;) { if (!(p->flags & MDB_PG_FLAGS_PERMANENT)) br_multicast_del_pg(sgmp, p, pp); else pp = &p->next; } } void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp, struct net_bridge_port_group *sg) { struct net_bridge_port_group_sg_key sg_key; struct net_bridge *br = star_mp->br; struct net_bridge_mcast_port *pmctx; struct net_bridge_port_group *pg; struct net_bridge_mcast *brmctx; if (WARN_ON(br_multicast_is_star_g(&sg->key.addr))) return; if (WARN_ON(!br_multicast_is_star_g(&star_mp->addr))) return; 
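	/* Here sg is an S,G port group and star_mp is its matching *,G entry.
	 * The code below mirrors the *,G host-joined state onto the S,G and
	 * then walks every EXCLUDE-mode port of the *,G, installing a
	 * kernel-managed (RTPROT_KERNEL) S,G entry flagged STAR_EXCL so that
	 * traffic for this source keeps reaching those ports.
	 */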
br_multicast_sg_host_state(star_mp, sg); memset(&sg_key, 0, sizeof(sg_key)); sg_key.addr = sg->key.addr; /* we need to add all exclude ports to the S,G */ for (pg = mlock_dereference(star_mp->ports, br); pg; pg = mlock_dereference(pg->next, br)) { struct net_bridge_port_group *src_pg; if (pg == sg || pg->filter_mode == MCAST_INCLUDE) continue; sg_key.port = pg->key.port; if (br_sg_port_find(br, &sg_key)) continue; pmctx = br_multicast_pg_to_port_ctx(pg); if (!pmctx) continue; brmctx = br_multicast_port_ctx_get_global(pmctx); src_pg = __br_multicast_add_group(brmctx, pmctx, &sg->key.addr, sg->eth_addr, MCAST_INCLUDE, false, false); if (IS_ERR_OR_NULL(src_pg) || src_pg->rt_protocol != RTPROT_KERNEL) continue; src_pg->flags |= MDB_PG_FLAGS_STAR_EXCL; } } static void br_multicast_fwd_src_add(struct net_bridge_group_src *src) { struct net_bridge_mdb_entry *star_mp; struct net_bridge_mcast_port *pmctx; struct net_bridge_port_group *sg; struct net_bridge_mcast *brmctx; struct br_ip sg_ip; if (src->flags & BR_SGRP_F_INSTALLED) return; memset(&sg_ip, 0, sizeof(sg_ip)); pmctx = br_multicast_pg_to_port_ctx(src->pg); if (!pmctx) return; brmctx = br_multicast_port_ctx_get_global(pmctx); sg_ip = src->pg->key.addr; sg_ip.src = src->addr.src; sg = __br_multicast_add_group(brmctx, pmctx, &sg_ip, src->pg->eth_addr, MCAST_INCLUDE, false, !timer_pending(&src->timer)); if (IS_ERR_OR_NULL(sg)) return; src->flags |= BR_SGRP_F_INSTALLED; sg->flags &= ~MDB_PG_FLAGS_STAR_EXCL; /* if it was added by user-space as perm we can skip next steps */ if (sg->rt_protocol != RTPROT_KERNEL && (sg->flags & MDB_PG_FLAGS_PERMANENT)) return; /* the kernel is now responsible for removing this S,G */ del_timer(&sg->timer); star_mp = br_mdb_ip_get(src->br, &src->pg->key.addr); if (!star_mp) return; br_multicast_sg_add_exclude_ports(star_mp, sg); } static void br_multicast_fwd_src_remove(struct net_bridge_group_src *src, bool fastleave) { struct net_bridge_port_group *p, *pg = src->pg; struct net_bridge_port_group __rcu **pp; struct net_bridge_mdb_entry *mp; struct br_ip sg_ip; memset(&sg_ip, 0, sizeof(sg_ip)); sg_ip = pg->key.addr; sg_ip.src = src->addr.src; mp = br_mdb_ip_get(src->br, &sg_ip); if (!mp) return; for (pp = &mp->ports; (p = mlock_dereference(*pp, src->br)) != NULL; pp = &p->next) { if (!br_port_group_equal(p, pg->key.port, pg->eth_addr)) continue; if (p->rt_protocol != RTPROT_KERNEL && (p->flags & MDB_PG_FLAGS_PERMANENT) && !(src->flags & BR_SGRP_F_USER_ADDED)) break; if (fastleave) p->flags |= MDB_PG_FLAGS_FAST_LEAVE; br_multicast_del_pg(mp, p, pp); break; } src->flags &= ~BR_SGRP_F_INSTALLED; } /* install S,G and based on src's timer enable or disable forwarding */ static void br_multicast_fwd_src_handle(struct net_bridge_group_src *src) { struct net_bridge_port_group_sg_key sg_key; struct net_bridge_port_group *sg; u8 old_flags; br_multicast_fwd_src_add(src); memset(&sg_key, 0, sizeof(sg_key)); sg_key.addr = src->pg->key.addr; sg_key.addr.src = src->addr.src; sg_key.port = src->pg->key.port; sg = br_sg_port_find(src->br, &sg_key); if (!sg || (sg->flags & MDB_PG_FLAGS_PERMANENT)) return; old_flags = sg->flags; if (timer_pending(&src->timer)) sg->flags &= ~MDB_PG_FLAGS_BLOCKED; else sg->flags |= MDB_PG_FLAGS_BLOCKED; if (old_flags != sg->flags) { struct net_bridge_mdb_entry *sg_mp; sg_mp = br_mdb_ip_get(src->br, &sg_key.addr); if (!sg_mp) return; br_mdb_notify(src->br->dev, sg_mp, sg, RTM_NEWMDB); } } static void br_multicast_destroy_mdb_entry(struct net_bridge_mcast_gc *gc) { struct net_bridge_mdb_entry *mp; mp = 
container_of(gc, struct net_bridge_mdb_entry, mcast_gc); WARN_ON(!hlist_unhashed(&mp->mdb_node)); WARN_ON(mp->ports); timer_shutdown_sync(&mp->timer); kfree_rcu(mp, rcu); } static void br_multicast_del_mdb_entry(struct net_bridge_mdb_entry *mp) { struct net_bridge *br = mp->br; rhashtable_remove_fast(&br->mdb_hash_tbl, &mp->rhnode, br_mdb_rht_params); hlist_del_init_rcu(&mp->mdb_node); hlist_add_head(&mp->mcast_gc.gc_node, &br->mcast_gc_list); queue_work(system_long_wq, &br->mcast_gc_work); } static void br_multicast_group_expired(struct timer_list *t) { struct net_bridge_mdb_entry *mp = from_timer(mp, t, timer); struct net_bridge *br = mp->br; spin_lock(&br->multicast_lock); if (hlist_unhashed(&mp->mdb_node) || !netif_running(br->dev) || timer_pending(&mp->timer)) goto out; br_multicast_host_leave(mp, true); if (mp->ports) goto out; br_multicast_del_mdb_entry(mp); out: spin_unlock(&br->multicast_lock); } static void br_multicast_destroy_group_src(struct net_bridge_mcast_gc *gc) { struct net_bridge_group_src *src; src = container_of(gc, struct net_bridge_group_src, mcast_gc); WARN_ON(!hlist_unhashed(&src->node)); timer_shutdown_sync(&src->timer); kfree_rcu(src, rcu); } void __br_multicast_del_group_src(struct net_bridge_group_src *src) { struct net_bridge *br = src->pg->key.port->br; hlist_del_init_rcu(&src->node); src->pg->src_ents--; hlist_add_head(&src->mcast_gc.gc_node, &br->mcast_gc_list); queue_work(system_long_wq, &br->mcast_gc_work); } void br_multicast_del_group_src(struct net_bridge_group_src *src, bool fastleave) { br_multicast_fwd_src_remove(src, fastleave); __br_multicast_del_group_src(src); } static int br_multicast_port_ngroups_inc_one(struct net_bridge_mcast_port *pmctx, struct netlink_ext_ack *extack, const char *what) { u32 max = READ_ONCE(pmctx->mdb_max_entries); u32 n = READ_ONCE(pmctx->mdb_n_entries); if (max && n >= max) { NL_SET_ERR_MSG_FMT_MOD(extack, "%s is already in %u groups, and mcast_max_groups=%u", what, n, max); return -E2BIG; } WRITE_ONCE(pmctx->mdb_n_entries, n + 1); return 0; } static void br_multicast_port_ngroups_dec_one(struct net_bridge_mcast_port *pmctx) { u32 n = READ_ONCE(pmctx->mdb_n_entries); WARN_ON_ONCE(n == 0); WRITE_ONCE(pmctx->mdb_n_entries, n - 1); } static int br_multicast_port_ngroups_inc(struct net_bridge_port *port, const struct br_ip *group, struct netlink_ext_ack *extack) { struct net_bridge_mcast_port *pmctx; int err; lockdep_assert_held_once(&port->br->multicast_lock); /* Always count on the port context. */ err = br_multicast_port_ngroups_inc_one(&port->multicast_ctx, extack, "Port"); if (err) { trace_br_mdb_full(port->dev, group); return err; } /* Only count on the VLAN context if VID is given, and if snooping on * that VLAN is enabled. 
*/ if (!group->vid) return 0; pmctx = br_multicast_port_vid_to_port_ctx(port, group->vid); if (!pmctx) return 0; err = br_multicast_port_ngroups_inc_one(pmctx, extack, "Port-VLAN"); if (err) { trace_br_mdb_full(port->dev, group); goto dec_one_out; } return 0; dec_one_out: br_multicast_port_ngroups_dec_one(&port->multicast_ctx); return err; } static void br_multicast_port_ngroups_dec(struct net_bridge_port *port, u16 vid) { struct net_bridge_mcast_port *pmctx; lockdep_assert_held_once(&port->br->multicast_lock); if (vid) { pmctx = br_multicast_port_vid_to_port_ctx(port, vid); if (pmctx) br_multicast_port_ngroups_dec_one(pmctx); } br_multicast_port_ngroups_dec_one(&port->multicast_ctx); } u32 br_multicast_ngroups_get(const struct net_bridge_mcast_port *pmctx) { return READ_ONCE(pmctx->mdb_n_entries); } void br_multicast_ngroups_set_max(struct net_bridge_mcast_port *pmctx, u32 max) { WRITE_ONCE(pmctx->mdb_max_entries, max); } u32 br_multicast_ngroups_get_max(const struct net_bridge_mcast_port *pmctx) { return READ_ONCE(pmctx->mdb_max_entries); } static void br_multicast_destroy_port_group(struct net_bridge_mcast_gc *gc) { struct net_bridge_port_group *pg; pg = container_of(gc, struct net_bridge_port_group, mcast_gc); WARN_ON(!hlist_unhashed(&pg->mglist)); WARN_ON(!hlist_empty(&pg->src_list)); timer_shutdown_sync(&pg->rexmit_timer); timer_shutdown_sync(&pg->timer); kfree_rcu(pg, rcu); } void br_multicast_del_pg(struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, struct net_bridge_port_group __rcu **pp) { struct net_bridge *br = pg->key.port->br; struct net_bridge_group_src *ent; struct hlist_node *tmp; rcu_assign_pointer(*pp, pg->next); hlist_del_init(&pg->mglist); br_multicast_eht_clean_sets(pg); hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node) br_multicast_del_group_src(ent, false); br_mdb_notify(br->dev, mp, pg, RTM_DELMDB); if (!br_multicast_is_star_g(&mp->addr)) { rhashtable_remove_fast(&br->sg_port_tbl, &pg->rhnode, br_sg_port_rht_params); br_multicast_sg_del_exclude_ports(mp); } else { br_multicast_star_g_handle_mode(pg, MCAST_INCLUDE); } br_multicast_port_ngroups_dec(pg->key.port, pg->key.addr.vid); hlist_add_head(&pg->mcast_gc.gc_node, &br->mcast_gc_list); queue_work(system_long_wq, &br->mcast_gc_work); if (!mp->ports && !mp->host_joined && netif_running(br->dev)) mod_timer(&mp->timer, jiffies); } static void br_multicast_find_del_pg(struct net_bridge *br, struct net_bridge_port_group *pg) { struct net_bridge_port_group __rcu **pp; struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; mp = br_mdb_ip_get(br, &pg->key.addr); if (WARN_ON(!mp)) return; for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; pp = &p->next) { if (p != pg) continue; br_multicast_del_pg(mp, pg, pp); return; } WARN_ON(1); } static void br_multicast_port_group_expired(struct timer_list *t) { struct net_bridge_port_group *pg = from_timer(pg, t, timer); struct net_bridge_group_src *src_ent; struct net_bridge *br = pg->key.port->br; struct hlist_node *tmp; bool changed; spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || timer_pending(&pg->timer) || hlist_unhashed(&pg->mglist) || pg->flags & MDB_PG_FLAGS_PERMANENT) goto out; changed = !!(pg->filter_mode == MCAST_EXCLUDE); pg->filter_mode = MCAST_INCLUDE; hlist_for_each_entry_safe(src_ent, tmp, &pg->src_list, node) { if (!timer_pending(&src_ent->timer)) { br_multicast_del_group_src(src_ent, false); changed = true; } } if (hlist_empty(&pg->src_list)) { br_multicast_find_del_pg(br, pg); } else if (changed) { 
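		/* Some sources still have running timers, so the port group
		 * stays but has been switched to INCLUDE mode above; for a
		 * *,G entry this also drops the auto-installed STAR_EXCL S,G
		 * entries, and userspace is notified via RTM_NEWMDB below.
		 */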
struct net_bridge_mdb_entry *mp = br_mdb_ip_get(br, &pg->key.addr); if (changed && br_multicast_is_star_g(&pg->key.addr)) br_multicast_star_g_handle_mode(pg, MCAST_INCLUDE); if (WARN_ON(!mp)) goto out; br_mdb_notify(br->dev, mp, pg, RTM_NEWMDB); } out: spin_unlock(&br->multicast_lock); } static void br_multicast_gc(struct hlist_head *head) { struct net_bridge_mcast_gc *gcent; struct hlist_node *tmp; hlist_for_each_entry_safe(gcent, tmp, head, gc_node) { hlist_del_init(&gcent->gc_node); gcent->destroy(gcent); } } static void __br_multicast_query_handle_vlan(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct sk_buff *skb) { struct net_bridge_vlan *vlan = NULL; if (pmctx && br_multicast_port_ctx_is_vlan(pmctx)) vlan = pmctx->vlan; else if (br_multicast_ctx_is_vlan(brmctx)) vlan = brmctx->vlan; if (vlan && !(vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED)) { u16 vlan_proto; if (br_vlan_get_proto(brmctx->br->dev, &vlan_proto) != 0) return; __vlan_hwaccel_put_tag(skb, htons(vlan_proto), vlan->vid); } } static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, __be32 ip_dst, __be32 group, bool with_srcs, bool over_lmqt, u8 sflag, u8 *igmp_type, bool *need_rexmit) { struct net_bridge_port *p = pg ? pg->key.port : NULL; struct net_bridge_group_src *ent; size_t pkt_size, igmp_hdr_size; unsigned long now = jiffies; struct igmpv3_query *ihv3; void *csum_start = NULL; __sum16 *csum = NULL; struct sk_buff *skb; struct igmphdr *ih; struct ethhdr *eth; unsigned long lmqt; struct iphdr *iph; u16 lmqt_srcs = 0; igmp_hdr_size = sizeof(*ih); if (brmctx->multicast_igmp_version == 3) { igmp_hdr_size = sizeof(*ihv3); if (pg && with_srcs) { lmqt = now + (brmctx->multicast_last_member_interval * brmctx->multicast_last_member_count); hlist_for_each_entry(ent, &pg->src_list, node) { if (over_lmqt == time_after(ent->timer.expires, lmqt) && ent->src_query_rexmit_cnt > 0) lmqt_srcs++; } if (!lmqt_srcs) return NULL; igmp_hdr_size += lmqt_srcs * sizeof(__be32); } } pkt_size = sizeof(*eth) + sizeof(*iph) + 4 + igmp_hdr_size; if ((p && pkt_size > p->dev->mtu) || pkt_size > brmctx->br->dev->mtu) return NULL; skb = netdev_alloc_skb_ip_align(brmctx->br->dev, pkt_size); if (!skb) goto out; __br_multicast_query_handle_vlan(brmctx, pmctx, skb); skb->protocol = htons(ETH_P_IP); skb_reset_mac_header(skb); eth = eth_hdr(skb); ether_addr_copy(eth->h_source, brmctx->br->dev->dev_addr); ip_eth_mc_map(ip_dst, eth->h_dest); eth->h_proto = htons(ETH_P_IP); skb_put(skb, sizeof(*eth)); skb_set_network_header(skb, skb->len); iph = ip_hdr(skb); iph->tot_len = htons(pkt_size - sizeof(*eth)); iph->version = 4; iph->ihl = 6; iph->tos = 0xc0; iph->id = 0; iph->frag_off = htons(IP_DF); iph->ttl = 1; iph->protocol = IPPROTO_IGMP; iph->saddr = br_opt_get(brmctx->br, BROPT_MULTICAST_QUERY_USE_IFADDR) ? inet_select_addr(brmctx->br->dev, 0, RT_SCOPE_LINK) : 0; iph->daddr = ip_dst; ((u8 *)&iph[1])[0] = IPOPT_RA; ((u8 *)&iph[1])[1] = 4; ((u8 *)&iph[1])[2] = 0; ((u8 *)&iph[1])[3] = 0; ip_send_check(iph); skb_put(skb, 24); skb_set_transport_header(skb, skb->len); *igmp_type = IGMP_HOST_MEMBERSHIP_QUERY; switch (brmctx->multicast_igmp_version) { case 2: ih = igmp_hdr(skb); ih->type = IGMP_HOST_MEMBERSHIP_QUERY; ih->code = (group ? 
brmctx->multicast_last_member_interval : brmctx->multicast_query_response_interval) / (HZ / IGMP_TIMER_SCALE); ih->group = group; ih->csum = 0; csum = &ih->csum; csum_start = (void *)ih; break; case 3: ihv3 = igmpv3_query_hdr(skb); ihv3->type = IGMP_HOST_MEMBERSHIP_QUERY; ihv3->code = (group ? brmctx->multicast_last_member_interval : brmctx->multicast_query_response_interval) / (HZ / IGMP_TIMER_SCALE); ihv3->group = group; ihv3->qqic = brmctx->multicast_query_interval / HZ; ihv3->nsrcs = htons(lmqt_srcs); ihv3->resv = 0; ihv3->suppress = sflag; ihv3->qrv = 2; ihv3->csum = 0; csum = &ihv3->csum; csum_start = (void *)ihv3; if (!pg || !with_srcs) break; lmqt_srcs = 0; hlist_for_each_entry(ent, &pg->src_list, node) { if (over_lmqt == time_after(ent->timer.expires, lmqt) && ent->src_query_rexmit_cnt > 0) { ihv3->srcs[lmqt_srcs++] = ent->addr.src.ip4; ent->src_query_rexmit_cnt--; if (need_rexmit && ent->src_query_rexmit_cnt) *need_rexmit = true; } } if (WARN_ON(lmqt_srcs != ntohs(ihv3->nsrcs))) { kfree_skb(skb); return NULL; } break; } if (WARN_ON(!csum || !csum_start)) { kfree_skb(skb); return NULL; } *csum = ip_compute_csum(csum_start, igmp_hdr_size); skb_put(skb, igmp_hdr_size); __skb_pull(skb, sizeof(*eth)); out: return skb; } #if IS_ENABLED(CONFIG_IPV6) static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, const struct in6_addr *ip6_dst, const struct in6_addr *group, bool with_srcs, bool over_llqt, u8 sflag, u8 *igmp_type, bool *need_rexmit) { struct net_bridge_port *p = pg ? pg->key.port : NULL; struct net_bridge_group_src *ent; size_t pkt_size, mld_hdr_size; unsigned long now = jiffies; struct mld2_query *mld2q; void *csum_start = NULL; unsigned long interval; __sum16 *csum = NULL; struct ipv6hdr *ip6h; struct mld_msg *mldq; struct sk_buff *skb; unsigned long llqt; struct ethhdr *eth; u16 llqt_srcs = 0; u8 *hopopt; mld_hdr_size = sizeof(*mldq); if (brmctx->multicast_mld_version == 2) { mld_hdr_size = sizeof(*mld2q); if (pg && with_srcs) { llqt = now + (brmctx->multicast_last_member_interval * brmctx->multicast_last_member_count); hlist_for_each_entry(ent, &pg->src_list, node) { if (over_llqt == time_after(ent->timer.expires, llqt) && ent->src_query_rexmit_cnt > 0) llqt_srcs++; } if (!llqt_srcs) return NULL; mld_hdr_size += llqt_srcs * sizeof(struct in6_addr); } } pkt_size = sizeof(*eth) + sizeof(*ip6h) + 8 + mld_hdr_size; if ((p && pkt_size > p->dev->mtu) || pkt_size > brmctx->br->dev->mtu) return NULL; skb = netdev_alloc_skb_ip_align(brmctx->br->dev, pkt_size); if (!skb) goto out; __br_multicast_query_handle_vlan(brmctx, pmctx, skb); skb->protocol = htons(ETH_P_IPV6); /* Ethernet header */ skb_reset_mac_header(skb); eth = eth_hdr(skb); ether_addr_copy(eth->h_source, brmctx->br->dev->dev_addr); eth->h_proto = htons(ETH_P_IPV6); skb_put(skb, sizeof(*eth)); /* IPv6 header + HbH option */ skb_set_network_header(skb, skb->len); ip6h = ipv6_hdr(skb); *(__force __be32 *)ip6h = htonl(0x60000000); ip6h->payload_len = htons(8 + mld_hdr_size); ip6h->nexthdr = IPPROTO_HOPOPTS; ip6h->hop_limit = 1; ip6h->daddr = *ip6_dst; if (ipv6_dev_get_saddr(dev_net(brmctx->br->dev), brmctx->br->dev, &ip6h->daddr, 0, &ip6h->saddr)) { kfree_skb(skb); br_opt_toggle(brmctx->br, BROPT_HAS_IPV6_ADDR, false); return NULL; } br_opt_toggle(brmctx->br, BROPT_HAS_IPV6_ADDR, true); ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest); hopopt = (u8 *)(ip6h + 1); hopopt[0] = IPPROTO_ICMPV6; /* next hdr */ hopopt[1] = 0; /* length of HbH 
*/ hopopt[2] = IPV6_TLV_ROUTERALERT; /* Router Alert */ hopopt[3] = 2; /* Length of RA Option */ hopopt[4] = 0; /* Type = 0x0000 (MLD) */ hopopt[5] = 0; hopopt[6] = IPV6_TLV_PAD1; /* Pad1 */ hopopt[7] = IPV6_TLV_PAD1; /* Pad1 */ skb_put(skb, sizeof(*ip6h) + 8); /* ICMPv6 */ skb_set_transport_header(skb, skb->len); interval = ipv6_addr_any(group) ? brmctx->multicast_query_response_interval : brmctx->multicast_last_member_interval; *igmp_type = ICMPV6_MGM_QUERY; switch (brmctx->multicast_mld_version) { case 1: mldq = (struct mld_msg *)icmp6_hdr(skb); mldq->mld_type = ICMPV6_MGM_QUERY; mldq->mld_code = 0; mldq->mld_cksum = 0; mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval)); mldq->mld_reserved = 0; mldq->mld_mca = *group; csum = &mldq->mld_cksum; csum_start = (void *)mldq; break; case 2: mld2q = (struct mld2_query *)icmp6_hdr(skb); mld2q->mld2q_mrc = htons((u16)jiffies_to_msecs(interval)); mld2q->mld2q_type = ICMPV6_MGM_QUERY; mld2q->mld2q_code = 0; mld2q->mld2q_cksum = 0; mld2q->mld2q_resv1 = 0; mld2q->mld2q_resv2 = 0; mld2q->mld2q_suppress = sflag; mld2q->mld2q_qrv = 2; mld2q->mld2q_nsrcs = htons(llqt_srcs); mld2q->mld2q_qqic = brmctx->multicast_query_interval / HZ; mld2q->mld2q_mca = *group; csum = &mld2q->mld2q_cksum; csum_start = (void *)mld2q; if (!pg || !with_srcs) break; llqt_srcs = 0; hlist_for_each_entry(ent, &pg->src_list, node) { if (over_llqt == time_after(ent->timer.expires, llqt) && ent->src_query_rexmit_cnt > 0) { mld2q->mld2q_srcs[llqt_srcs++] = ent->addr.src.ip6; ent->src_query_rexmit_cnt--; if (need_rexmit && ent->src_query_rexmit_cnt) *need_rexmit = true; } } if (WARN_ON(llqt_srcs != ntohs(mld2q->mld2q_nsrcs))) { kfree_skb(skb); return NULL; } break; } if (WARN_ON(!csum || !csum_start)) { kfree_skb(skb); return NULL; } *csum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, mld_hdr_size, IPPROTO_ICMPV6, csum_partial(csum_start, mld_hdr_size, 0)); skb_put(skb, mld_hdr_size); __skb_pull(skb, sizeof(*eth)); out: return skb; } #endif static struct sk_buff *br_multicast_alloc_query(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, struct br_ip *ip_dst, struct br_ip *group, bool with_srcs, bool over_lmqt, u8 sflag, u8 *igmp_type, bool *need_rexmit) { __be32 ip4_dst; switch (group->proto) { case htons(ETH_P_IP): ip4_dst = ip_dst ? 
ip_dst->dst.ip4 : htonl(INADDR_ALLHOSTS_GROUP); return br_ip4_multicast_alloc_query(brmctx, pmctx, pg, ip4_dst, group->dst.ip4, with_srcs, over_lmqt, sflag, igmp_type, need_rexmit); #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): { struct in6_addr ip6_dst; if (ip_dst) ip6_dst = ip_dst->dst.ip6; else ipv6_addr_set(&ip6_dst, htonl(0xff020000), 0, 0, htonl(1)); return br_ip6_multicast_alloc_query(brmctx, pmctx, pg, &ip6_dst, &group->dst.ip6, with_srcs, over_lmqt, sflag, igmp_type, need_rexmit); } #endif } return NULL; } struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br, struct br_ip *group) { struct net_bridge_mdb_entry *mp; int err; mp = br_mdb_ip_get(br, group); if (mp) return mp; if (atomic_read(&br->mdb_hash_tbl.nelems) >= br->hash_max) { trace_br_mdb_full(br->dev, group); br_mc_disabled_update(br->dev, false, NULL); br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false); return ERR_PTR(-E2BIG); } mp = kzalloc(sizeof(*mp), GFP_ATOMIC); if (unlikely(!mp)) return ERR_PTR(-ENOMEM); mp->br = br; mp->addr = *group; mp->mcast_gc.destroy = br_multicast_destroy_mdb_entry; timer_setup(&mp->timer, br_multicast_group_expired, 0); err = rhashtable_lookup_insert_fast(&br->mdb_hash_tbl, &mp->rhnode, br_mdb_rht_params); if (err) { kfree(mp); mp = ERR_PTR(err); } else { hlist_add_head_rcu(&mp->mdb_node, &br->mdb_list); } return mp; } static void br_multicast_group_src_expired(struct timer_list *t) { struct net_bridge_group_src *src = from_timer(src, t, timer); struct net_bridge_port_group *pg; struct net_bridge *br = src->br; spin_lock(&br->multicast_lock); if (hlist_unhashed(&src->node) || !netif_running(br->dev) || timer_pending(&src->timer)) goto out; pg = src->pg; if (pg->filter_mode == MCAST_INCLUDE) { br_multicast_del_group_src(src, false); if (!hlist_empty(&pg->src_list)) goto out; br_multicast_find_del_pg(br, pg); } else { br_multicast_fwd_src_handle(src); } out: spin_unlock(&br->multicast_lock); } struct net_bridge_group_src * br_multicast_find_group_src(struct net_bridge_port_group *pg, struct br_ip *ip) { struct net_bridge_group_src *ent; switch (ip->proto) { case htons(ETH_P_IP): hlist_for_each_entry(ent, &pg->src_list, node) if (ip->src.ip4 == ent->addr.src.ip4) return ent; break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): hlist_for_each_entry(ent, &pg->src_list, node) if (!ipv6_addr_cmp(&ent->addr.src.ip6, &ip->src.ip6)) return ent; break; #endif } return NULL; } struct net_bridge_group_src * br_multicast_new_group_src(struct net_bridge_port_group *pg, struct br_ip *src_ip) { struct net_bridge_group_src *grp_src; if (unlikely(pg->src_ents >= PG_SRC_ENT_LIMIT)) return NULL; switch (src_ip->proto) { case htons(ETH_P_IP): if (ipv4_is_zeronet(src_ip->src.ip4) || ipv4_is_multicast(src_ip->src.ip4)) return NULL; break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): if (ipv6_addr_any(&src_ip->src.ip6) || ipv6_addr_is_multicast(&src_ip->src.ip6)) return NULL; break; #endif } grp_src = kzalloc(sizeof(*grp_src), GFP_ATOMIC); if (unlikely(!grp_src)) return NULL; grp_src->pg = pg; grp_src->br = pg->key.port->br; grp_src->addr = *src_ip; grp_src->mcast_gc.destroy = br_multicast_destroy_group_src; timer_setup(&grp_src->timer, br_multicast_group_src_expired, 0); hlist_add_head_rcu(&grp_src->node, &pg->src_list); pg->src_ents++; return grp_src; } struct net_bridge_port_group *br_multicast_new_port_group( struct net_bridge_port *port, const struct br_ip *group, struct net_bridge_port_group __rcu *next, unsigned char flags, const unsigned char *src, u8 filter_mode, u8 
rt_protocol, struct netlink_ext_ack *extack) { struct net_bridge_port_group *p; int err; err = br_multicast_port_ngroups_inc(port, group, extack); if (err) return NULL; p = kzalloc(sizeof(*p), GFP_ATOMIC); if (unlikely(!p)) { NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new port group"); goto dec_out; } p->key.addr = *group; p->key.port = port; p->flags = flags; p->filter_mode = filter_mode; p->rt_protocol = rt_protocol; p->eht_host_tree = RB_ROOT; p->eht_set_tree = RB_ROOT; p->mcast_gc.destroy = br_multicast_destroy_port_group; INIT_HLIST_HEAD(&p->src_list); if (!br_multicast_is_star_g(group) && rhashtable_lookup_insert_fast(&port->br->sg_port_tbl, &p->rhnode, br_sg_port_rht_params)) { NL_SET_ERR_MSG_MOD(extack, "Couldn't insert new port group"); goto free_out; } rcu_assign_pointer(p->next, next); timer_setup(&p->timer, br_multicast_port_group_expired, 0); timer_setup(&p->rexmit_timer, br_multicast_port_group_rexmit, 0); hlist_add_head(&p->mglist, &port->mglist); if (src) memcpy(p->eth_addr, src, ETH_ALEN); else eth_broadcast_addr(p->eth_addr); return p; free_out: kfree(p); dec_out: br_multicast_port_ngroups_dec(port, group->vid); return NULL; } void br_multicast_del_port_group(struct net_bridge_port_group *p) { struct net_bridge_port *port = p->key.port; __u16 vid = p->key.addr.vid; hlist_del_init(&p->mglist); if (!br_multicast_is_star_g(&p->key.addr)) rhashtable_remove_fast(&port->br->sg_port_tbl, &p->rhnode, br_sg_port_rht_params); kfree(p); br_multicast_port_ngroups_dec(port, vid); } void br_multicast_host_join(const struct net_bridge_mcast *brmctx, struct net_bridge_mdb_entry *mp, bool notify) { if (!mp->host_joined) { mp->host_joined = true; if (br_multicast_is_star_g(&mp->addr)) br_multicast_star_g_host_state(mp); if (notify) br_mdb_notify(mp->br->dev, mp, NULL, RTM_NEWMDB); } if (br_group_is_l2(&mp->addr)) return; mod_timer(&mp->timer, jiffies + brmctx->multicast_membership_interval); } void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify) { if (!mp->host_joined) return; mp->host_joined = false; if (br_multicast_is_star_g(&mp->addr)) br_multicast_star_g_host_state(mp); if (notify) br_mdb_notify(mp->br->dev, mp, NULL, RTM_DELMDB); } static struct net_bridge_port_group * __br_multicast_add_group(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct br_ip *group, const unsigned char *src, u8 filter_mode, bool igmpv2_mldv1, bool blocked) { struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p = NULL; struct net_bridge_mdb_entry *mp; unsigned long now = jiffies; if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto out; mp = br_multicast_new_group(brmctx->br, group); if (IS_ERR(mp)) return ERR_CAST(mp); if (!pmctx) { br_multicast_host_join(brmctx, mp, true); goto out; } for (pp = &mp->ports; (p = mlock_dereference(*pp, brmctx->br)) != NULL; pp = &p->next) { if (br_port_group_equal(p, pmctx->port, src)) goto found; if ((unsigned long)p->key.port < (unsigned long)pmctx->port) break; } p = br_multicast_new_port_group(pmctx->port, group, *pp, 0, src, filter_mode, RTPROT_KERNEL, NULL); if (unlikely(!p)) { p = ERR_PTR(-ENOMEM); goto out; } rcu_assign_pointer(*pp, p); if (blocked) p->flags |= MDB_PG_FLAGS_BLOCKED; br_mdb_notify(brmctx->br->dev, mp, p, RTM_NEWMDB); found: if (igmpv2_mldv1) mod_timer(&p->timer, now + brmctx->multicast_membership_interval); out: return p; } static int br_multicast_add_group(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct br_ip *group, const unsigned char *src, u8 
filter_mode, bool igmpv2_mldv1) { struct net_bridge_port_group *pg; int err; spin_lock(&brmctx->br->multicast_lock); pg = __br_multicast_add_group(brmctx, pmctx, group, src, filter_mode, igmpv2_mldv1, false); /* NULL is considered valid for host joined groups */ err = PTR_ERR_OR_ZERO(pg); spin_unlock(&brmctx->br->multicast_lock); return err; } static int br_ip4_multicast_add_group(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, __be32 group, __u16 vid, const unsigned char *src, bool igmpv2) { struct br_ip br_group; u8 filter_mode; if (ipv4_is_local_multicast(group)) return 0; memset(&br_group, 0, sizeof(br_group)); br_group.dst.ip4 = group; br_group.proto = htons(ETH_P_IP); br_group.vid = vid; filter_mode = igmpv2 ? MCAST_EXCLUDE : MCAST_INCLUDE; return br_multicast_add_group(brmctx, pmctx, &br_group, src, filter_mode, igmpv2); } #if IS_ENABLED(CONFIG_IPV6) static int br_ip6_multicast_add_group(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, const struct in6_addr *group, __u16 vid, const unsigned char *src, bool mldv1) { struct br_ip br_group; u8 filter_mode; if (ipv6_addr_is_ll_all_nodes(group)) return 0; memset(&br_group, 0, sizeof(br_group)); br_group.dst.ip6 = *group; br_group.proto = htons(ETH_P_IPV6); br_group.vid = vid; filter_mode = mldv1 ? MCAST_EXCLUDE : MCAST_INCLUDE; return br_multicast_add_group(brmctx, pmctx, &br_group, src, filter_mode, mldv1); } #endif static bool br_multicast_rport_del(struct hlist_node *rlist) { if (hlist_unhashed(rlist)) return false; hlist_del_init_rcu(rlist); return true; } static bool br_ip4_multicast_rport_del(struct net_bridge_mcast_port *pmctx) { return br_multicast_rport_del(&pmctx->ip4_rlist); } static bool br_ip6_multicast_rport_del(struct net_bridge_mcast_port *pmctx) { #if IS_ENABLED(CONFIG_IPV6) return br_multicast_rport_del(&pmctx->ip6_rlist); #else return false; #endif } static void br_multicast_router_expired(struct net_bridge_mcast_port *pmctx, struct timer_list *t, struct hlist_node *rlist) { struct net_bridge *br = pmctx->port->br; bool del; spin_lock(&br->multicast_lock); if (pmctx->multicast_router == MDB_RTR_TYPE_DISABLED || pmctx->multicast_router == MDB_RTR_TYPE_PERM || timer_pending(t)) goto out; del = br_multicast_rport_del(rlist); br_multicast_rport_del_notify(pmctx, del); out: spin_unlock(&br->multicast_lock); } static void br_ip4_multicast_router_expired(struct timer_list *t) { struct net_bridge_mcast_port *pmctx = from_timer(pmctx, t, ip4_mc_router_timer); br_multicast_router_expired(pmctx, t, &pmctx->ip4_rlist); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_router_expired(struct timer_list *t) { struct net_bridge_mcast_port *pmctx = from_timer(pmctx, t, ip6_mc_router_timer); br_multicast_router_expired(pmctx, t, &pmctx->ip6_rlist); } #endif static void br_mc_router_state_change(struct net_bridge *p, bool is_mc_router) { struct switchdev_attr attr = { .orig_dev = p->dev, .id = SWITCHDEV_ATTR_ID_BRIDGE_MROUTER, .flags = SWITCHDEV_F_DEFER, .u.mrouter = is_mc_router, }; switchdev_port_attr_set(p->dev, &attr, NULL); } static void br_multicast_local_router_expired(struct net_bridge_mcast *brmctx, struct timer_list *timer) { spin_lock(&brmctx->br->multicast_lock); if (brmctx->multicast_router == MDB_RTR_TYPE_DISABLED || brmctx->multicast_router == MDB_RTR_TYPE_PERM || br_ip4_multicast_is_router(brmctx) || br_ip6_multicast_is_router(brmctx)) goto out; br_mc_router_state_change(brmctx->br, false); out: spin_unlock(&brmctx->br->multicast_lock); } static void 
br_ip4_multicast_local_router_expired(struct timer_list *t) { struct net_bridge_mcast *brmctx = from_timer(brmctx, t, ip4_mc_router_timer); br_multicast_local_router_expired(brmctx, t); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_local_router_expired(struct timer_list *t) { struct net_bridge_mcast *brmctx = from_timer(brmctx, t, ip6_mc_router_timer); br_multicast_local_router_expired(brmctx, t); } #endif static void br_multicast_querier_expired(struct net_bridge_mcast *brmctx, struct bridge_mcast_own_query *query) { spin_lock(&brmctx->br->multicast_lock); if (!netif_running(brmctx->br->dev) || br_multicast_ctx_vlan_global_disabled(brmctx) || !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED)) goto out; br_multicast_start_querier(brmctx, query); out: spin_unlock(&brmctx->br->multicast_lock); } static void br_ip4_multicast_querier_expired(struct timer_list *t) { struct net_bridge_mcast *brmctx = from_timer(brmctx, t, ip4_other_query.timer); br_multicast_querier_expired(brmctx, &brmctx->ip4_own_query); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_querier_expired(struct timer_list *t) { struct net_bridge_mcast *brmctx = from_timer(brmctx, t, ip6_other_query.timer); br_multicast_querier_expired(brmctx, &brmctx->ip6_own_query); } #endif static void br_multicast_query_delay_expired(struct timer_list *t) { } static void br_multicast_select_own_querier(struct net_bridge_mcast *brmctx, struct br_ip *ip, struct sk_buff *skb) { if (ip->proto == htons(ETH_P_IP)) brmctx->ip4_querier.addr.src.ip4 = ip_hdr(skb)->saddr; #if IS_ENABLED(CONFIG_IPV6) else brmctx->ip6_querier.addr.src.ip6 = ipv6_hdr(skb)->saddr; #endif } static void __br_multicast_send_query(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, struct br_ip *ip_dst, struct br_ip *group, bool with_srcs, u8 sflag, bool *need_rexmit) { bool over_lmqt = !!sflag; struct sk_buff *skb; u8 igmp_type; if (!br_multicast_ctx_should_use(brmctx, pmctx) || !br_multicast_ctx_matches_vlan_snooping(brmctx)) return; again_under_lmqt: skb = br_multicast_alloc_query(brmctx, pmctx, pg, ip_dst, group, with_srcs, over_lmqt, sflag, &igmp_type, need_rexmit); if (!skb) return; if (pmctx) { skb->dev = pmctx->port->dev; br_multicast_count(brmctx->br, pmctx->port, skb, igmp_type, BR_MCAST_DIR_TX); NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, dev_net(pmctx->port->dev), NULL, skb, NULL, skb->dev, br_dev_queue_push_xmit); if (over_lmqt && with_srcs && sflag) { over_lmqt = false; goto again_under_lmqt; } } else { br_multicast_select_own_querier(brmctx, group, skb); br_multicast_count(brmctx->br, NULL, skb, igmp_type, BR_MCAST_DIR_RX); netif_rx(skb); } } static void br_multicast_read_querier(const struct bridge_mcast_querier *querier, struct bridge_mcast_querier *dest) { unsigned int seq; memset(dest, 0, sizeof(*dest)); do { seq = read_seqcount_begin(&querier->seq); dest->port_ifidx = querier->port_ifidx; memcpy(&dest->addr, &querier->addr, sizeof(struct br_ip)); } while (read_seqcount_retry(&querier->seq, seq)); } static void br_multicast_update_querier(struct net_bridge_mcast *brmctx, struct bridge_mcast_querier *querier, int ifindex, struct br_ip *saddr) { write_seqcount_begin(&querier->seq); querier->port_ifidx = ifindex; memcpy(&querier->addr, saddr, sizeof(*saddr)); write_seqcount_end(&querier->seq); } static void br_multicast_send_query(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct bridge_mcast_own_query *own_query) { struct bridge_mcast_other_query *other_query = 
NULL; struct bridge_mcast_querier *querier; struct br_ip br_group; unsigned long time; if (!br_multicast_ctx_should_use(brmctx, pmctx) || !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED) || !brmctx->multicast_querier) return; memset(&br_group.dst, 0, sizeof(br_group.dst)); if (pmctx ? (own_query == &pmctx->ip4_own_query) : (own_query == &brmctx->ip4_own_query)) { querier = &brmctx->ip4_querier; other_query = &brmctx->ip4_other_query; br_group.proto = htons(ETH_P_IP); #if IS_ENABLED(CONFIG_IPV6) } else { querier = &brmctx->ip6_querier; other_query = &brmctx->ip6_other_query; br_group.proto = htons(ETH_P_IPV6); #endif } if (!other_query || timer_pending(&other_query->timer)) return; /* we're about to select ourselves as querier */ if (!pmctx && querier->port_ifidx) { struct br_ip zeroip = {}; br_multicast_update_querier(brmctx, querier, 0, &zeroip); } __br_multicast_send_query(brmctx, pmctx, NULL, NULL, &br_group, false, 0, NULL); time = jiffies; time += own_query->startup_sent < brmctx->multicast_startup_query_count ? brmctx->multicast_startup_query_interval : brmctx->multicast_query_interval; mod_timer(&own_query->timer, time); } static void br_multicast_port_query_expired(struct net_bridge_mcast_port *pmctx, struct bridge_mcast_own_query *query) { struct net_bridge *br = pmctx->port->br; struct net_bridge_mcast *brmctx; spin_lock(&br->multicast_lock); if (br_multicast_port_ctx_state_stopped(pmctx)) goto out; brmctx = br_multicast_port_ctx_get_global(pmctx); if (query->startup_sent < brmctx->multicast_startup_query_count) query->startup_sent++; br_multicast_send_query(brmctx, pmctx, query); out: spin_unlock(&br->multicast_lock); } static void br_ip4_multicast_port_query_expired(struct timer_list *t) { struct net_bridge_mcast_port *pmctx = from_timer(pmctx, t, ip4_own_query.timer); br_multicast_port_query_expired(pmctx, &pmctx->ip4_own_query); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_port_query_expired(struct timer_list *t) { struct net_bridge_mcast_port *pmctx = from_timer(pmctx, t, ip6_own_query.timer); br_multicast_port_query_expired(pmctx, &pmctx->ip6_own_query); } #endif static void br_multicast_port_group_rexmit(struct timer_list *t) { struct net_bridge_port_group *pg = from_timer(pg, t, rexmit_timer); struct bridge_mcast_other_query *other_query = NULL; struct net_bridge *br = pg->key.port->br; struct net_bridge_mcast_port *pmctx; struct net_bridge_mcast *brmctx; bool need_rexmit = false; spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || hlist_unhashed(&pg->mglist) || !br_opt_get(br, BROPT_MULTICAST_ENABLED)) goto out; pmctx = br_multicast_pg_to_port_ctx(pg); if (!pmctx) goto out; brmctx = br_multicast_port_ctx_get_global(pmctx); if (!brmctx->multicast_querier) goto out; if (pg->key.addr.proto == htons(ETH_P_IP)) other_query = &brmctx->ip4_other_query; #if IS_ENABLED(CONFIG_IPV6) else other_query = &brmctx->ip6_other_query; #endif if (!other_query || timer_pending(&other_query->timer)) goto out; if (pg->grp_query_rexmit_cnt) { pg->grp_query_rexmit_cnt--; __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr, &pg->key.addr, false, 1, NULL); } __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr, &pg->key.addr, true, 0, &need_rexmit); if (pg->grp_query_rexmit_cnt || need_rexmit) mod_timer(&pg->rexmit_timer, jiffies + brmctx->multicast_last_member_interval); out: spin_unlock(&br->multicast_lock); } static int br_mc_disabled_update(struct net_device *dev, bool value, struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .orig_dev = 
dev, .id = SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED, .flags = SWITCHDEV_F_DEFER, .u.mc_disabled = !value, }; return switchdev_port_attr_set(dev, &attr, extack); } void br_multicast_port_ctx_init(struct net_bridge_port *port, struct net_bridge_vlan *vlan, struct net_bridge_mcast_port *pmctx) { pmctx->port = port; pmctx->vlan = vlan; pmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; timer_setup(&pmctx->ip4_mc_router_timer, br_ip4_multicast_router_expired, 0); timer_setup(&pmctx->ip4_own_query.timer, br_ip4_multicast_port_query_expired, 0); #if IS_ENABLED(CONFIG_IPV6) timer_setup(&pmctx->ip6_mc_router_timer, br_ip6_multicast_router_expired, 0); timer_setup(&pmctx->ip6_own_query.timer, br_ip6_multicast_port_query_expired, 0); #endif } void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx) { #if IS_ENABLED(CONFIG_IPV6) del_timer_sync(&pmctx->ip6_mc_router_timer); #endif del_timer_sync(&pmctx->ip4_mc_router_timer); } int br_multicast_add_port(struct net_bridge_port *port) { int err; port->multicast_eht_hosts_limit = BR_MCAST_DEFAULT_EHT_HOSTS_LIMIT; br_multicast_port_ctx_init(port, NULL, &port->multicast_ctx); err = br_mc_disabled_update(port->dev, br_opt_get(port->br, BROPT_MULTICAST_ENABLED), NULL); if (err && err != -EOPNOTSUPP) return err; port->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats); if (!port->mcast_stats) return -ENOMEM; return 0; } void br_multicast_del_port(struct net_bridge_port *port) { struct net_bridge *br = port->br; struct net_bridge_port_group *pg; struct hlist_node *n; /* Take care of the remaining groups, only perm ones should be left */ spin_lock_bh(&br->multicast_lock); hlist_for_each_entry_safe(pg, n, &port->mglist, mglist) br_multicast_find_del_pg(br, pg); spin_unlock_bh(&br->multicast_lock); flush_work(&br->mcast_gc_work); br_multicast_port_ctx_deinit(&port->multicast_ctx); free_percpu(port->mcast_stats); } static void br_multicast_enable(struct bridge_mcast_own_query *query) { query->startup_sent = 0; if (try_to_del_timer_sync(&query->timer) >= 0 || del_timer(&query->timer)) mod_timer(&query->timer, jiffies); } static void __br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx) { struct net_bridge *br = pmctx->port->br; struct net_bridge_mcast *brmctx; brmctx = br_multicast_port_ctx_get_global(pmctx); if (!br_opt_get(br, BROPT_MULTICAST_ENABLED) || !netif_running(br->dev)) return; br_multicast_enable(&pmctx->ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) br_multicast_enable(&pmctx->ip6_own_query); #endif if (pmctx->multicast_router == MDB_RTR_TYPE_PERM) { br_ip4_multicast_add_router(brmctx, pmctx); br_ip6_multicast_add_router(brmctx, pmctx); } if (br_multicast_port_ctx_is_vlan(pmctx)) { struct net_bridge_port_group *pg; u32 n = 0; /* The mcast_n_groups counter might be wrong. First, * BR_VLFLAG_MCAST_ENABLED is toggled before temporary entries * are flushed, thus mcast_n_groups after the toggle does not * reflect the true values. And second, permanent entries added * while BR_VLFLAG_MCAST_ENABLED was disabled, are not reflected * either. Thus we have to refresh the counter. 
*/ hlist_for_each_entry(pg, &pmctx->port->mglist, mglist) { if (pg->key.addr.vid == pmctx->vlan->vid) n++; } WRITE_ONCE(pmctx->mdb_n_entries, n); } } void br_multicast_enable_port(struct net_bridge_port *port) { struct net_bridge *br = port->br; spin_lock_bh(&br->multicast_lock); __br_multicast_enable_port_ctx(&port->multicast_ctx); spin_unlock_bh(&br->multicast_lock); } static void __br_multicast_disable_port_ctx(struct net_bridge_mcast_port *pmctx) { struct net_bridge_port_group *pg; struct hlist_node *n; bool del = false; hlist_for_each_entry_safe(pg, n, &pmctx->port->mglist, mglist) if (!(pg->flags & MDB_PG_FLAGS_PERMANENT) && (!br_multicast_port_ctx_is_vlan(pmctx) || pg->key.addr.vid == pmctx->vlan->vid)) br_multicast_find_del_pg(pmctx->port->br, pg); del |= br_ip4_multicast_rport_del(pmctx); del_timer(&pmctx->ip4_mc_router_timer); del_timer(&pmctx->ip4_own_query.timer); del |= br_ip6_multicast_rport_del(pmctx); #if IS_ENABLED(CONFIG_IPV6) del_timer(&pmctx->ip6_mc_router_timer); del_timer(&pmctx->ip6_own_query.timer); #endif br_multicast_rport_del_notify(pmctx, del); } void br_multicast_disable_port(struct net_bridge_port *port) { spin_lock_bh(&port->br->multicast_lock); __br_multicast_disable_port_ctx(&port->multicast_ctx); spin_unlock_bh(&port->br->multicast_lock); } static int __grp_src_delete_marked(struct net_bridge_port_group *pg) { struct net_bridge_group_src *ent; struct hlist_node *tmp; int deleted = 0; hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node) if (ent->flags & BR_SGRP_F_DELETE) { br_multicast_del_group_src(ent, false); deleted++; } return deleted; } static void __grp_src_mod_timer(struct net_bridge_group_src *src, unsigned long expires) { mod_timer(&src->timer, expires); br_multicast_fwd_src_handle(src); } static void __grp_src_query_marked_and_rexmit(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg) { struct bridge_mcast_other_query *other_query = NULL; u32 lmqc = brmctx->multicast_last_member_count; unsigned long lmqt, lmi, now = jiffies; struct net_bridge_group_src *ent; if (!netif_running(brmctx->br->dev) || !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED)) return; if (pg->key.addr.proto == htons(ETH_P_IP)) other_query = &brmctx->ip4_other_query; #if IS_ENABLED(CONFIG_IPV6) else other_query = &brmctx->ip6_other_query; #endif lmqt = now + br_multicast_lmqt(brmctx); hlist_for_each_entry(ent, &pg->src_list, node) { if (ent->flags & BR_SGRP_F_SEND) { ent->flags &= ~BR_SGRP_F_SEND; if (ent->timer.expires > lmqt) { if (brmctx->multicast_querier && other_query && !timer_pending(&other_query->timer)) ent->src_query_rexmit_cnt = lmqc; __grp_src_mod_timer(ent, lmqt); } } } if (!brmctx->multicast_querier || !other_query || timer_pending(&other_query->timer)) return; __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr, &pg->key.addr, true, 1, NULL); lmi = now + brmctx->multicast_last_member_interval; if (!timer_pending(&pg->rexmit_timer) || time_after(pg->rexmit_timer.expires, lmi)) mod_timer(&pg->rexmit_timer, lmi); } static void __grp_send_query_and_rexmit(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg) { struct bridge_mcast_other_query *other_query = NULL; unsigned long now = jiffies, lmi; if (!netif_running(brmctx->br->dev) || !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED)) return; if (pg->key.addr.proto == htons(ETH_P_IP)) other_query = &brmctx->ip4_other_query; #if IS_ENABLED(CONFIG_IPV6) else other_query = &brmctx->ip6_other_query; #endif if 
(brmctx->multicast_querier && other_query && !timer_pending(&other_query->timer)) { lmi = now + brmctx->multicast_last_member_interval; pg->grp_query_rexmit_cnt = brmctx->multicast_last_member_count - 1; __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr, &pg->key.addr, false, 0, NULL); if (!timer_pending(&pg->rexmit_timer) || time_after(pg->rexmit_timer.expires, lmi)) mod_timer(&pg->rexmit_timer, lmi); } if (pg->filter_mode == MCAST_EXCLUDE && (!timer_pending(&pg->timer) || time_after(pg->timer.expires, now + br_multicast_lmqt(brmctx)))) mod_timer(&pg->timer, now + br_multicast_lmqt(brmctx)); } /* State Msg type New state Actions * INCLUDE (A) IS_IN (B) INCLUDE (A+B) (B)=GMI * INCLUDE (A) ALLOW (B) INCLUDE (A+B) (B)=GMI * EXCLUDE (X,Y) ALLOW (A) EXCLUDE (X+A,Y-A) (A)=GMI */ static bool br_multicast_isinc_allow(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; unsigned long now = jiffies; bool changed = false; struct br_ip src_ip; u32 src_idx; memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (!ent) { ent = br_multicast_new_group_src(pg, &src_ip); if (ent) changed = true; } if (ent) __grp_src_mod_timer(ent, now + br_multicast_gmi(brmctx)); } if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type)) changed = true; return changed; } /* State Msg type New state Actions * INCLUDE (A) IS_EX (B) EXCLUDE (A*B,B-A) (B-A)=0 * Delete (A-B) * Group Timer=GMI */ static void __grp_src_isexc_incl(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; struct br_ip src_ip; u32 src_idx; hlist_for_each_entry(ent, &pg->src_list, node) ent->flags |= BR_SGRP_F_DELETE; memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) ent->flags &= ~BR_SGRP_F_DELETE; else ent = br_multicast_new_group_src(pg, &src_ip); if (ent) br_multicast_fwd_src_handle(ent); } br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); __grp_src_delete_marked(pg); } /* State Msg type New state Actions * EXCLUDE (X,Y) IS_EX (A) EXCLUDE (A-Y,Y*A) (A-X-Y)=GMI * Delete (X-A) * Delete (Y-A) * Group Timer=GMI */ static bool __grp_src_isexc_excl(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; unsigned long now = jiffies; bool changed = false; struct br_ip src_ip; u32 src_idx; hlist_for_each_entry(ent, &pg->src_list, node) ent->flags |= BR_SGRP_F_DELETE; memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { ent->flags &= ~BR_SGRP_F_DELETE; } else { ent = br_multicast_new_group_src(pg, &src_ip); if (ent) { __grp_src_mod_timer(ent, now + br_multicast_gmi(brmctx)); changed = true; } } } if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type)) changed 
= true; if (__grp_src_delete_marked(pg)) changed = true; return changed; } static bool br_multicast_isexc(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { bool changed = false; switch (pg->filter_mode) { case MCAST_INCLUDE: __grp_src_isexc_incl(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); br_multicast_star_g_handle_mode(pg, MCAST_EXCLUDE); changed = true; break; case MCAST_EXCLUDE: changed = __grp_src_isexc_excl(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); break; } pg->filter_mode = MCAST_EXCLUDE; mod_timer(&pg->timer, jiffies + br_multicast_gmi(brmctx)); return changed; } /* State Msg type New state Actions * INCLUDE (A) TO_IN (B) INCLUDE (A+B) (B)=GMI * Send Q(G,A-B) */ static bool __grp_src_toin_incl(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { u32 src_idx, to_send = pg->src_ents; struct net_bridge_group_src *ent; unsigned long now = jiffies; bool changed = false; struct br_ip src_ip; hlist_for_each_entry(ent, &pg->src_list, node) ent->flags |= BR_SGRP_F_SEND; memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { ent->flags &= ~BR_SGRP_F_SEND; to_send--; } else { ent = br_multicast_new_group_src(pg, &src_ip); if (ent) changed = true; } if (ent) __grp_src_mod_timer(ent, now + br_multicast_gmi(brmctx)); } if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type)) changed = true; if (to_send) __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); return changed; } /* State Msg type New state Actions * EXCLUDE (X,Y) TO_IN (A) EXCLUDE (X+A,Y-A) (A)=GMI * Send Q(G,X-A) * Send Q(G) */ static bool __grp_src_toin_excl(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { u32 src_idx, to_send = pg->src_ents; struct net_bridge_group_src *ent; unsigned long now = jiffies; bool changed = false; struct br_ip src_ip; hlist_for_each_entry(ent, &pg->src_list, node) if (timer_pending(&ent->timer)) ent->flags |= BR_SGRP_F_SEND; memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { if (timer_pending(&ent->timer)) { ent->flags &= ~BR_SGRP_F_SEND; to_send--; } } else { ent = br_multicast_new_group_src(pg, &src_ip); if (ent) changed = true; } if (ent) __grp_src_mod_timer(ent, now + br_multicast_gmi(brmctx)); } if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type)) changed = true; if (to_send) __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); __grp_send_query_and_rexmit(brmctx, pmctx, pg); return changed; } static bool br_multicast_toin(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { bool changed = false; switch (pg->filter_mode) { case MCAST_INCLUDE: changed = __grp_src_toin_incl(brmctx, pmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); break; case MCAST_EXCLUDE: changed = 
__grp_src_toin_excl(brmctx, pmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); break; } if (br_multicast_eht_should_del_pg(pg)) { pg->flags |= MDB_PG_FLAGS_FAST_LEAVE; br_multicast_find_del_pg(pg->key.port->br, pg); /* a notification has already been sent and we shouldn't * access pg after the delete so we have to return false */ changed = false; } return changed; } /* State Msg type New state Actions * INCLUDE (A) TO_EX (B) EXCLUDE (A*B,B-A) (B-A)=0 * Delete (A-B) * Send Q(G,A*B) * Group Timer=GMI */ static void __grp_src_toex_incl(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; u32 src_idx, to_send = 0; struct br_ip src_ip; hlist_for_each_entry(ent, &pg->src_list, node) ent->flags = (ent->flags & ~BR_SGRP_F_SEND) | BR_SGRP_F_DELETE; memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { ent->flags = (ent->flags & ~BR_SGRP_F_DELETE) | BR_SGRP_F_SEND; to_send++; } else { ent = br_multicast_new_group_src(pg, &src_ip); } if (ent) br_multicast_fwd_src_handle(ent); } br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); __grp_src_delete_marked(pg); if (to_send) __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); } /* State Msg type New state Actions * EXCLUDE (X,Y) TO_EX (A) EXCLUDE (A-Y,Y*A) (A-X-Y)=Group Timer * Delete (X-A) * Delete (Y-A) * Send Q(G,A-Y) * Group Timer=GMI */ static bool __grp_src_toex_excl(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; u32 src_idx, to_send = 0; bool changed = false; struct br_ip src_ip; hlist_for_each_entry(ent, &pg->src_list, node) ent->flags = (ent->flags & ~BR_SGRP_F_SEND) | BR_SGRP_F_DELETE; memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { ent->flags &= ~BR_SGRP_F_DELETE; } else { ent = br_multicast_new_group_src(pg, &src_ip); if (ent) { __grp_src_mod_timer(ent, pg->timer.expires); changed = true; } } if (ent && timer_pending(&ent->timer)) { ent->flags |= BR_SGRP_F_SEND; to_send++; } } if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type)) changed = true; if (__grp_src_delete_marked(pg)) changed = true; if (to_send) __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); return changed; } static bool br_multicast_toex(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { bool changed = false; switch (pg->filter_mode) { case MCAST_INCLUDE: __grp_src_toex_incl(brmctx, pmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); br_multicast_star_g_handle_mode(pg, MCAST_EXCLUDE); changed = true; break; case MCAST_EXCLUDE: changed = __grp_src_toex_excl(brmctx, pmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); break; } pg->filter_mode = MCAST_EXCLUDE; mod_timer(&pg->timer, jiffies + br_multicast_gmi(brmctx)); return changed; } /* State Msg type New state Actions * INCLUDE (A) BLOCK (B) INCLUDE 
(A) Send Q(G,A*B) */ static bool __grp_src_block_incl(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; u32 src_idx, to_send = 0; bool changed = false; struct br_ip src_ip; hlist_for_each_entry(ent, &pg->src_list, node) ent->flags &= ~BR_SGRP_F_SEND; memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { ent->flags |= BR_SGRP_F_SEND; to_send++; } } if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type)) changed = true; if (to_send) __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); return changed; } /* State Msg type New state Actions * EXCLUDE (X,Y) BLOCK (A) EXCLUDE (X+(A-Y),Y) (A-X-Y)=Group Timer * Send Q(G,A-Y) */ static bool __grp_src_block_excl(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; u32 src_idx, to_send = 0; bool changed = false; struct br_ip src_ip; hlist_for_each_entry(ent, &pg->src_list, node) ent->flags &= ~BR_SGRP_F_SEND; memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (!ent) { ent = br_multicast_new_group_src(pg, &src_ip); if (ent) { __grp_src_mod_timer(ent, pg->timer.expires); changed = true; } } if (ent && timer_pending(&ent->timer)) { ent->flags |= BR_SGRP_F_SEND; to_send++; } } if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type)) changed = true; if (to_send) __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); return changed; } static bool br_multicast_block(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { bool changed = false; switch (pg->filter_mode) { case MCAST_INCLUDE: changed = __grp_src_block_incl(brmctx, pmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); break; case MCAST_EXCLUDE: changed = __grp_src_block_excl(brmctx, pmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); break; } if ((pg->filter_mode == MCAST_INCLUDE && hlist_empty(&pg->src_list)) || br_multicast_eht_should_del_pg(pg)) { if (br_multicast_eht_should_del_pg(pg)) pg->flags |= MDB_PG_FLAGS_FAST_LEAVE; br_multicast_find_del_pg(pg->key.port->br, pg); /* a notification has already been sent and we shouldn't * access pg after the delete so we have to return false */ changed = false; } return changed; } static struct net_bridge_port_group * br_multicast_find_port(struct net_bridge_mdb_entry *mp, struct net_bridge_port *p, const unsigned char *src) { struct net_bridge *br __maybe_unused = mp->br; struct net_bridge_port_group *pg; for (pg = mlock_dereference(mp->ports, br); pg; pg = mlock_dereference(pg->next, br)) if (br_port_group_equal(pg, p, src)) return pg; return NULL; } static int br_ip4_multicast_igmp3_report(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { bool igmpv2 = brmctx->multicast_igmp_version == 2; struct net_bridge_mdb_entry *mdst; struct net_bridge_port_group 
*pg; const unsigned char *src; struct igmpv3_report *ih; struct igmpv3_grec *grec; int i, len, num, type; __be32 group, *h_addr; bool changed = false; int err = 0; u16 nsrcs; ih = igmpv3_report_hdr(skb); num = ntohs(ih->ngrec); len = skb_transport_offset(skb) + sizeof(*ih); for (i = 0; i < num; i++) { len += sizeof(*grec); if (!ip_mc_may_pull(skb, len)) return -EINVAL; grec = (void *)(skb->data + len - sizeof(*grec)); group = grec->grec_mca; type = grec->grec_type; nsrcs = ntohs(grec->grec_nsrcs); len += nsrcs * 4; if (!ip_mc_may_pull(skb, len)) return -EINVAL; switch (type) { case IGMPV3_MODE_IS_INCLUDE: case IGMPV3_MODE_IS_EXCLUDE: case IGMPV3_CHANGE_TO_INCLUDE: case IGMPV3_CHANGE_TO_EXCLUDE: case IGMPV3_ALLOW_NEW_SOURCES: case IGMPV3_BLOCK_OLD_SOURCES: break; default: continue; } src = eth_hdr(skb)->h_source; if (nsrcs == 0 && (type == IGMPV3_CHANGE_TO_INCLUDE || type == IGMPV3_MODE_IS_INCLUDE)) { if (!pmctx || igmpv2) { br_ip4_multicast_leave_group(brmctx, pmctx, group, vid, src); continue; } } else { err = br_ip4_multicast_add_group(brmctx, pmctx, group, vid, src, igmpv2); if (err) break; } if (!pmctx || igmpv2) continue; spin_lock(&brmctx->br->multicast_lock); if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto unlock_continue; mdst = br_mdb_ip4_get(brmctx->br, group, vid); if (!mdst) goto unlock_continue; pg = br_multicast_find_port(mdst, pmctx->port, src); if (!pg || (pg->flags & MDB_PG_FLAGS_PERMANENT)) goto unlock_continue; /* reload grec and host addr */ grec = (void *)(skb->data + len - sizeof(*grec) - (nsrcs * 4)); h_addr = &ip_hdr(skb)->saddr; switch (type) { case IGMPV3_ALLOW_NEW_SOURCES: changed = br_multicast_isinc_allow(brmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_MODE_IS_INCLUDE: changed = br_multicast_isinc_allow(brmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_MODE_IS_EXCLUDE: changed = br_multicast_isexc(brmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_CHANGE_TO_INCLUDE: changed = br_multicast_toin(brmctx, pmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_CHANGE_TO_EXCLUDE: changed = br_multicast_toex(brmctx, pmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_BLOCK_OLD_SOURCES: changed = br_multicast_block(brmctx, pmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(__be32), type); break; } if (changed) br_mdb_notify(brmctx->br->dev, mdst, pg, RTM_NEWMDB); unlock_continue: spin_unlock(&brmctx->br->multicast_lock); } return err; } #if IS_ENABLED(CONFIG_IPV6) static int br_ip6_multicast_mld2_report(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { bool mldv1 = brmctx->multicast_mld_version == 1; struct net_bridge_mdb_entry *mdst; struct net_bridge_port_group *pg; unsigned int nsrcs_offset; struct mld2_report *mld2r; const unsigned char *src; struct in6_addr *h_addr; struct mld2_grec *grec; unsigned int grec_len; bool changed = false; int i, len, num; int err = 0; if (!ipv6_mc_may_pull(skb, sizeof(*mld2r))) return -EINVAL; mld2r = (struct mld2_report *)icmp6_hdr(skb); num = ntohs(mld2r->mld2r_ngrec); len = skb_transport_offset(skb) + sizeof(*mld2r); for (i = 0; i < num; i++) { __be16 *_nsrcs, __nsrcs; u16 nsrcs; nsrcs_offset = len + offsetof(struct mld2_grec, grec_nsrcs); if (skb_transport_offset(skb) + ipv6_transport_len(skb) < nsrcs_offset + sizeof(__nsrcs)) return -EINVAL; _nsrcs = skb_header_pointer(skb, nsrcs_offset, sizeof(__nsrcs), 
&__nsrcs); if (!_nsrcs) return -EINVAL; nsrcs = ntohs(*_nsrcs); grec_len = struct_size(grec, grec_src, nsrcs); if (!ipv6_mc_may_pull(skb, len + grec_len)) return -EINVAL; grec = (struct mld2_grec *)(skb->data + len); len += grec_len; switch (grec->grec_type) { case MLD2_MODE_IS_INCLUDE: case MLD2_MODE_IS_EXCLUDE: case MLD2_CHANGE_TO_INCLUDE: case MLD2_CHANGE_TO_EXCLUDE: case MLD2_ALLOW_NEW_SOURCES: case MLD2_BLOCK_OLD_SOURCES: break; default: continue; } src = eth_hdr(skb)->h_source; if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE || grec->grec_type == MLD2_MODE_IS_INCLUDE) && nsrcs == 0) { if (!pmctx || mldv1) { br_ip6_multicast_leave_group(brmctx, pmctx, &grec->grec_mca, vid, src); continue; } } else { err = br_ip6_multicast_add_group(brmctx, pmctx, &grec->grec_mca, vid, src, mldv1); if (err) break; } if (!pmctx || mldv1) continue; spin_lock(&brmctx->br->multicast_lock); if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto unlock_continue; mdst = br_mdb_ip6_get(brmctx->br, &grec->grec_mca, vid); if (!mdst) goto unlock_continue; pg = br_multicast_find_port(mdst, pmctx->port, src); if (!pg || (pg->flags & MDB_PG_FLAGS_PERMANENT)) goto unlock_continue; h_addr = &ipv6_hdr(skb)->saddr; switch (grec->grec_type) { case MLD2_ALLOW_NEW_SOURCES: changed = br_multicast_isinc_allow(brmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_MODE_IS_INCLUDE: changed = br_multicast_isinc_allow(brmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_MODE_IS_EXCLUDE: changed = br_multicast_isexc(brmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_CHANGE_TO_INCLUDE: changed = br_multicast_toin(brmctx, pmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_CHANGE_TO_EXCLUDE: changed = br_multicast_toex(brmctx, pmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_BLOCK_OLD_SOURCES: changed = br_multicast_block(brmctx, pmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; } if (changed) br_mdb_notify(brmctx->br->dev, mdst, pg, RTM_NEWMDB); unlock_continue: spin_unlock(&brmctx->br->multicast_lock); } return err; } #endif static bool br_multicast_select_querier(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct br_ip *saddr) { int port_ifidx = pmctx ? 
pmctx->port->dev->ifindex : 0; struct timer_list *own_timer, *other_timer; struct bridge_mcast_querier *querier; switch (saddr->proto) { case htons(ETH_P_IP): querier = &brmctx->ip4_querier; own_timer = &brmctx->ip4_own_query.timer; other_timer = &brmctx->ip4_other_query.timer; if (!querier->addr.src.ip4 || ntohl(saddr->src.ip4) <= ntohl(querier->addr.src.ip4)) goto update; break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): querier = &brmctx->ip6_querier; own_timer = &brmctx->ip6_own_query.timer; other_timer = &brmctx->ip6_other_query.timer; if (ipv6_addr_cmp(&saddr->src.ip6, &querier->addr.src.ip6) <= 0) goto update; break; #endif default: return false; } if (!timer_pending(own_timer) && !timer_pending(other_timer)) goto update; return false; update: br_multicast_update_querier(brmctx, querier, port_ifidx, saddr); return true; } static struct net_bridge_port * __br_multicast_get_querier_port(struct net_bridge *br, const struct bridge_mcast_querier *querier) { int port_ifidx = READ_ONCE(querier->port_ifidx); struct net_bridge_port *p; struct net_device *dev; if (port_ifidx == 0) return NULL; dev = dev_get_by_index_rcu(dev_net(br->dev), port_ifidx); if (!dev) return NULL; p = br_port_get_rtnl_rcu(dev); if (!p || p->br != br) return NULL; return p; } size_t br_multicast_querier_state_size(void) { return nla_total_size(0) + /* nest attribute */ nla_total_size(sizeof(__be32)) + /* BRIDGE_QUERIER_IP_ADDRESS */ nla_total_size(sizeof(int)) + /* BRIDGE_QUERIER_IP_PORT */ nla_total_size_64bit(sizeof(u64)) + /* BRIDGE_QUERIER_IP_OTHER_TIMER */ #if IS_ENABLED(CONFIG_IPV6) nla_total_size(sizeof(struct in6_addr)) + /* BRIDGE_QUERIER_IPV6_ADDRESS */ nla_total_size(sizeof(int)) + /* BRIDGE_QUERIER_IPV6_PORT */ nla_total_size_64bit(sizeof(u64)) + /* BRIDGE_QUERIER_IPV6_OTHER_TIMER */ #endif 0; } /* protected by rtnl or rcu */ int br_multicast_dump_querier_state(struct sk_buff *skb, const struct net_bridge_mcast *brmctx, int nest_attr) { struct bridge_mcast_querier querier = {}; struct net_bridge_port *p; struct nlattr *nest; if (!br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED) || br_multicast_ctx_vlan_global_disabled(brmctx)) return 0; nest = nla_nest_start(skb, nest_attr); if (!nest) return -EMSGSIZE; rcu_read_lock(); if (!brmctx->multicast_querier && !timer_pending(&brmctx->ip4_other_query.timer)) goto out_v6; br_multicast_read_querier(&brmctx->ip4_querier, &querier); if (nla_put_in_addr(skb, BRIDGE_QUERIER_IP_ADDRESS, querier.addr.src.ip4)) { rcu_read_unlock(); goto out_err; } p = __br_multicast_get_querier_port(brmctx->br, &querier); if (timer_pending(&brmctx->ip4_other_query.timer) && (nla_put_u64_64bit(skb, BRIDGE_QUERIER_IP_OTHER_TIMER, br_timer_value(&brmctx->ip4_other_query.timer), BRIDGE_QUERIER_PAD) || (p && nla_put_u32(skb, BRIDGE_QUERIER_IP_PORT, p->dev->ifindex)))) { rcu_read_unlock(); goto out_err; } out_v6: #if IS_ENABLED(CONFIG_IPV6) if (!brmctx->multicast_querier && !timer_pending(&brmctx->ip6_other_query.timer)) goto out; br_multicast_read_querier(&brmctx->ip6_querier, &querier); if (nla_put_in6_addr(skb, BRIDGE_QUERIER_IPV6_ADDRESS, &querier.addr.src.ip6)) { rcu_read_unlock(); goto out_err; } p = __br_multicast_get_querier_port(brmctx->br, &querier); if (timer_pending(&brmctx->ip6_other_query.timer) && (nla_put_u64_64bit(skb, BRIDGE_QUERIER_IPV6_OTHER_TIMER, br_timer_value(&brmctx->ip6_other_query.timer), BRIDGE_QUERIER_PAD) || (p && nla_put_u32(skb, BRIDGE_QUERIER_IPV6_PORT, p->dev->ifindex)))) { rcu_read_unlock(); goto out_err; } out: #endif rcu_read_unlock(); 
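/* Added explanatory comment (not in the original file): by now the nest
 * holds, per enabled IP family, the elected querier address, the
 * remaining other-querier-present timer (when it is running) and, if
 * known, the ifindex of the port the querier was heard on. An empty
 * nest is cancelled right below, so userspace only sees the attribute
 * when there is actual state to report.
 */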
nla_nest_end(skb, nest); if (!nla_len(nest)) nla_nest_cancel(skb, nest); return 0; out_err: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static void br_multicast_update_query_timer(struct net_bridge_mcast *brmctx, struct bridge_mcast_other_query *query, unsigned long max_delay) { if (!timer_pending(&query->timer)) mod_timer(&query->delay_timer, jiffies + max_delay); mod_timer(&query->timer, jiffies + brmctx->multicast_querier_interval); } static void br_port_mc_router_state_change(struct net_bridge_port *p, bool is_mc_router) { struct switchdev_attr attr = { .orig_dev = p->dev, .id = SWITCHDEV_ATTR_ID_PORT_MROUTER, .flags = SWITCHDEV_F_DEFER, .u.mrouter = is_mc_router, }; switchdev_port_attr_set(p->dev, &attr, NULL); } static struct net_bridge_port * br_multicast_rport_from_node(struct net_bridge_mcast *brmctx, struct hlist_head *mc_router_list, struct hlist_node *rlist) { struct net_bridge_mcast_port *pmctx; #if IS_ENABLED(CONFIG_IPV6) if (mc_router_list == &brmctx->ip6_mc_router_list) pmctx = hlist_entry(rlist, struct net_bridge_mcast_port, ip6_rlist); else #endif pmctx = hlist_entry(rlist, struct net_bridge_mcast_port, ip4_rlist); return pmctx->port; } static struct hlist_node * br_multicast_get_rport_slot(struct net_bridge_mcast *brmctx, struct net_bridge_port *port, struct hlist_head *mc_router_list) { struct hlist_node *slot = NULL; struct net_bridge_port *p; struct hlist_node *rlist; hlist_for_each(rlist, mc_router_list) { p = br_multicast_rport_from_node(brmctx, mc_router_list, rlist); if ((unsigned long)port >= (unsigned long)p) break; slot = rlist; } return slot; } static bool br_multicast_no_router_otherpf(struct net_bridge_mcast_port *pmctx, struct hlist_node *rnode) { #if IS_ENABLED(CONFIG_IPV6) if (rnode != &pmctx->ip6_rlist) return hlist_unhashed(&pmctx->ip6_rlist); else return hlist_unhashed(&pmctx->ip4_rlist); #else return true; #endif } /* Add port to router_list * list is maintained ordered by pointer value * and locked by br->multicast_lock and RCU */ static void br_multicast_add_router(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct hlist_node *rlist, struct hlist_head *mc_router_list) { struct hlist_node *slot; if (!hlist_unhashed(rlist)) return; slot = br_multicast_get_rport_slot(brmctx, pmctx->port, mc_router_list); if (slot) hlist_add_behind_rcu(rlist, slot); else hlist_add_head_rcu(rlist, mc_router_list); /* For backwards compatibility for now, only notify if we * switched from no IPv4/IPv6 multicast router to a new * IPv4 or IPv6 multicast router. 
*/ if (br_multicast_no_router_otherpf(pmctx, rlist)) { br_rtr_notify(pmctx->port->br->dev, pmctx, RTM_NEWMDB); br_port_mc_router_state_change(pmctx->port, true); } } /* Add port to router_list * list is maintained ordered by pointer value * and locked by br->multicast_lock and RCU */ static void br_ip4_multicast_add_router(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx) { br_multicast_add_router(brmctx, pmctx, &pmctx->ip4_rlist, &brmctx->ip4_mc_router_list); } /* Add port to router_list * list is maintained ordered by pointer value * and locked by br->multicast_lock and RCU */ static void br_ip6_multicast_add_router(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx) { #if IS_ENABLED(CONFIG_IPV6) br_multicast_add_router(brmctx, pmctx, &pmctx->ip6_rlist, &brmctx->ip6_mc_router_list); #endif } static void br_multicast_mark_router(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct timer_list *timer, struct hlist_node *rlist, struct hlist_head *mc_router_list) { unsigned long now = jiffies; if (!br_multicast_ctx_should_use(brmctx, pmctx)) return; if (!pmctx) { if (brmctx->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) { if (!br_ip4_multicast_is_router(brmctx) && !br_ip6_multicast_is_router(brmctx)) br_mc_router_state_change(brmctx->br, true); mod_timer(timer, now + brmctx->multicast_querier_interval); } return; } if (pmctx->multicast_router == MDB_RTR_TYPE_DISABLED || pmctx->multicast_router == MDB_RTR_TYPE_PERM) return; br_multicast_add_router(brmctx, pmctx, rlist, mc_router_list); mod_timer(timer, now + brmctx->multicast_querier_interval); } static void br_ip4_multicast_mark_router(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx) { struct timer_list *timer = &brmctx->ip4_mc_router_timer; struct hlist_node *rlist = NULL; if (pmctx) { timer = &pmctx->ip4_mc_router_timer; rlist = &pmctx->ip4_rlist; } br_multicast_mark_router(brmctx, pmctx, timer, rlist, &brmctx->ip4_mc_router_list); } static void br_ip6_multicast_mark_router(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx) { #if IS_ENABLED(CONFIG_IPV6) struct timer_list *timer = &brmctx->ip6_mc_router_timer; struct hlist_node *rlist = NULL; if (pmctx) { timer = &pmctx->ip6_mc_router_timer; rlist = &pmctx->ip6_rlist; } br_multicast_mark_router(brmctx, pmctx, timer, rlist, &brmctx->ip6_mc_router_list); #endif } static void br_ip4_multicast_query_received(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct bridge_mcast_other_query *query, struct br_ip *saddr, unsigned long max_delay) { if (!br_multicast_select_querier(brmctx, pmctx, saddr)) return; br_multicast_update_query_timer(brmctx, query, max_delay); br_ip4_multicast_mark_router(brmctx, pmctx); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_query_received(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct bridge_mcast_other_query *query, struct br_ip *saddr, unsigned long max_delay) { if (!br_multicast_select_querier(brmctx, pmctx, saddr)) return; br_multicast_update_query_timer(brmctx, query, max_delay); br_ip6_multicast_mark_router(brmctx, pmctx); } #endif static void br_ip4_multicast_query(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { unsigned int transport_len = ip_transport_len(skb); const struct iphdr *iph = ip_hdr(skb); struct igmphdr *ih = igmp_hdr(skb); struct net_bridge_mdb_entry *mp; struct igmpv3_query *ih3; struct net_bridge_port_group *p; struct 
net_bridge_port_group __rcu **pp; struct br_ip saddr = {}; unsigned long max_delay; unsigned long now = jiffies; __be32 group; spin_lock(&brmctx->br->multicast_lock); if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto out; group = ih->group; if (transport_len == sizeof(*ih)) { max_delay = ih->code * (HZ / IGMP_TIMER_SCALE); if (!max_delay) { max_delay = 10 * HZ; group = 0; } } else if (transport_len >= sizeof(*ih3)) { ih3 = igmpv3_query_hdr(skb); if (ih3->nsrcs || (brmctx->multicast_igmp_version == 3 && group && ih3->suppress)) goto out; max_delay = ih3->code ? IGMPV3_MRC(ih3->code) * (HZ / IGMP_TIMER_SCALE) : 1; } else { goto out; } if (!group) { saddr.proto = htons(ETH_P_IP); saddr.src.ip4 = iph->saddr; br_ip4_multicast_query_received(brmctx, pmctx, &brmctx->ip4_other_query, &saddr, max_delay); goto out; } mp = br_mdb_ip4_get(brmctx->br, group, vid); if (!mp) goto out; max_delay *= brmctx->multicast_last_member_count; if (mp->host_joined && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, now + max_delay) : try_to_del_timer_sync(&mp->timer) >= 0)) mod_timer(&mp->timer, now + max_delay); for (pp = &mp->ports; (p = mlock_dereference(*pp, brmctx->br)) != NULL; pp = &p->next) { if (timer_pending(&p->timer) ? time_after(p->timer.expires, now + max_delay) : try_to_del_timer_sync(&p->timer) >= 0 && (brmctx->multicast_igmp_version == 2 || p->filter_mode == MCAST_EXCLUDE)) mod_timer(&p->timer, now + max_delay); } out: spin_unlock(&brmctx->br->multicast_lock); } #if IS_ENABLED(CONFIG_IPV6) static int br_ip6_multicast_query(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { unsigned int transport_len = ipv6_transport_len(skb); struct mld_msg *mld; struct net_bridge_mdb_entry *mp; struct mld2_query *mld2q; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; struct br_ip saddr = {}; unsigned long max_delay; unsigned long now = jiffies; unsigned int offset = skb_transport_offset(skb); const struct in6_addr *group = NULL; bool is_general_query; int err = 0; spin_lock(&brmctx->br->multicast_lock); if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto out; if (transport_len == sizeof(*mld)) { if (!pskb_may_pull(skb, offset + sizeof(*mld))) { err = -EINVAL; goto out; } mld = (struct mld_msg *) icmp6_hdr(skb); max_delay = msecs_to_jiffies(ntohs(mld->mld_maxdelay)); if (max_delay) group = &mld->mld_mca; } else { if (!pskb_may_pull(skb, offset + sizeof(*mld2q))) { err = -EINVAL; goto out; } mld2q = (struct mld2_query *)icmp6_hdr(skb); if (!mld2q->mld2q_nsrcs) group = &mld2q->mld2q_mca; if (brmctx->multicast_mld_version == 2 && !ipv6_addr_any(&mld2q->mld2q_mca) && mld2q->mld2q_suppress) goto out; max_delay = max(msecs_to_jiffies(mldv2_mrc(mld2q)), 1UL); } is_general_query = group && ipv6_addr_any(group); if (is_general_query) { saddr.proto = htons(ETH_P_IPV6); saddr.src.ip6 = ipv6_hdr(skb)->saddr; br_ip6_multicast_query_received(brmctx, pmctx, &brmctx->ip6_other_query, &saddr, max_delay); goto out; } else if (!group) { goto out; } mp = br_mdb_ip6_get(brmctx->br, group, vid); if (!mp) goto out; max_delay *= brmctx->multicast_last_member_count; if (mp->host_joined && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, now + max_delay) : try_to_del_timer_sync(&mp->timer) >= 0)) mod_timer(&mp->timer, now + max_delay); for (pp = &mp->ports; (p = mlock_dereference(*pp, brmctx->br)) != NULL; pp = &p->next) { if (timer_pending(&p->timer) ? 
time_after(p->timer.expires, now + max_delay) : try_to_del_timer_sync(&p->timer) >= 0 && (brmctx->multicast_mld_version == 1 || p->filter_mode == MCAST_EXCLUDE)) mod_timer(&p->timer, now + max_delay); } out: spin_unlock(&brmctx->br->multicast_lock); return err; } #endif static void br_multicast_leave_group(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct br_ip *group, struct bridge_mcast_other_query *other_query, struct bridge_mcast_own_query *own_query, const unsigned char *src) { struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; unsigned long now; unsigned long time; spin_lock(&brmctx->br->multicast_lock); if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto out; mp = br_mdb_ip_get(brmctx->br, group); if (!mp) goto out; if (pmctx && (pmctx->port->flags & BR_MULTICAST_FAST_LEAVE)) { struct net_bridge_port_group __rcu **pp; for (pp = &mp->ports; (p = mlock_dereference(*pp, brmctx->br)) != NULL; pp = &p->next) { if (!br_port_group_equal(p, pmctx->port, src)) continue; if (p->flags & MDB_PG_FLAGS_PERMANENT) break; p->flags |= MDB_PG_FLAGS_FAST_LEAVE; br_multicast_del_pg(mp, p, pp); } goto out; } if (timer_pending(&other_query->timer)) goto out; if (brmctx->multicast_querier) { __br_multicast_send_query(brmctx, pmctx, NULL, NULL, &mp->addr, false, 0, NULL); time = jiffies + brmctx->multicast_last_member_count * brmctx->multicast_last_member_interval; mod_timer(&own_query->timer, time); for (p = mlock_dereference(mp->ports, brmctx->br); p != NULL && pmctx != NULL; p = mlock_dereference(p->next, brmctx->br)) { if (!br_port_group_equal(p, pmctx->port, src)) continue; if (!hlist_unhashed(&p->mglist) && (timer_pending(&p->timer) ? time_after(p->timer.expires, time) : try_to_del_timer_sync(&p->timer) >= 0)) { mod_timer(&p->timer, time); } break; } } now = jiffies; time = now + brmctx->multicast_last_member_count * brmctx->multicast_last_member_interval; if (!pmctx) { if (mp->host_joined && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, time) : try_to_del_timer_sync(&mp->timer) >= 0)) { mod_timer(&mp->timer, time); } goto out; } for (p = mlock_dereference(mp->ports, brmctx->br); p != NULL; p = mlock_dereference(p->next, brmctx->br)) { if (p->key.port != pmctx->port) continue; if (!hlist_unhashed(&p->mglist) && (timer_pending(&p->timer) ? time_after(p->timer.expires, time) : try_to_del_timer_sync(&p->timer) >= 0)) { mod_timer(&p->timer, time); } break; } out: spin_unlock(&brmctx->br->multicast_lock); } static void br_ip4_multicast_leave_group(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, __be32 group, __u16 vid, const unsigned char *src) { struct br_ip br_group; struct bridge_mcast_own_query *own_query; if (ipv4_is_local_multicast(group)) return; own_query = pmctx ? &pmctx->ip4_own_query : &brmctx->ip4_own_query; memset(&br_group, 0, sizeof(br_group)); br_group.dst.ip4 = group; br_group.proto = htons(ETH_P_IP); br_group.vid = vid; br_multicast_leave_group(brmctx, pmctx, &br_group, &brmctx->ip4_other_query, own_query, src); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_leave_group(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, const struct in6_addr *group, __u16 vid, const unsigned char *src) { struct br_ip br_group; struct bridge_mcast_own_query *own_query; if (ipv6_addr_is_ll_all_nodes(group)) return; own_query = pmctx ? 
&pmctx->ip6_own_query : &brmctx->ip6_own_query; memset(&br_group, 0, sizeof(br_group)); br_group.dst.ip6 = *group; br_group.proto = htons(ETH_P_IPV6); br_group.vid = vid; br_multicast_leave_group(brmctx, pmctx, &br_group, &brmctx->ip6_other_query, own_query, src); } #endif static void br_multicast_err_count(const struct net_bridge *br, const struct net_bridge_port *p, __be16 proto) { struct bridge_mcast_stats __percpu *stats; struct bridge_mcast_stats *pstats; if (!br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) return; if (p) stats = p->mcast_stats; else stats = br->mcast_stats; if (WARN_ON(!stats)) return; pstats = this_cpu_ptr(stats); u64_stats_update_begin(&pstats->syncp); switch (proto) { case htons(ETH_P_IP): pstats->mstats.igmp_parse_errors++; break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): pstats->mstats.mld_parse_errors++; break; #endif } u64_stats_update_end(&pstats->syncp); } static void br_multicast_pim(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, const struct sk_buff *skb) { unsigned int offset = skb_transport_offset(skb); struct pimhdr *pimhdr, _pimhdr; pimhdr = skb_header_pointer(skb, offset, sizeof(_pimhdr), &_pimhdr); if (!pimhdr || pim_hdr_version(pimhdr) != PIM_VERSION || pim_hdr_type(pimhdr) != PIM_TYPE_HELLO) return; spin_lock(&brmctx->br->multicast_lock); br_ip4_multicast_mark_router(brmctx, pmctx); spin_unlock(&brmctx->br->multicast_lock); } static int br_ip4_multicast_mrd_rcv(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct sk_buff *skb) { if (ip_hdr(skb)->protocol != IPPROTO_IGMP || igmp_hdr(skb)->type != IGMP_MRDISC_ADV) return -ENOMSG; spin_lock(&brmctx->br->multicast_lock); br_ip4_multicast_mark_router(brmctx, pmctx); spin_unlock(&brmctx->br->multicast_lock); return 0; } static int br_multicast_ipv4_rcv(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { struct net_bridge_port *p = pmctx ? 
pmctx->port : NULL; const unsigned char *src; struct igmphdr *ih; int err; err = ip_mc_check_igmp(skb); if (err == -ENOMSG) { if (!ipv4_is_local_multicast(ip_hdr(skb)->daddr)) { BR_INPUT_SKB_CB(skb)->mrouters_only = 1; } else if (pim_ipv4_all_pim_routers(ip_hdr(skb)->daddr)) { if (ip_hdr(skb)->protocol == IPPROTO_PIM) br_multicast_pim(brmctx, pmctx, skb); } else if (ipv4_is_all_snoopers(ip_hdr(skb)->daddr)) { br_ip4_multicast_mrd_rcv(brmctx, pmctx, skb); } return 0; } else if (err < 0) { br_multicast_err_count(brmctx->br, p, skb->protocol); return err; } ih = igmp_hdr(skb); src = eth_hdr(skb)->h_source; BR_INPUT_SKB_CB(skb)->igmp = ih->type; switch (ih->type) { case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: BR_INPUT_SKB_CB(skb)->mrouters_only = 1; err = br_ip4_multicast_add_group(brmctx, pmctx, ih->group, vid, src, true); break; case IGMPV3_HOST_MEMBERSHIP_REPORT: err = br_ip4_multicast_igmp3_report(brmctx, pmctx, skb, vid); break; case IGMP_HOST_MEMBERSHIP_QUERY: br_ip4_multicast_query(brmctx, pmctx, skb, vid); break; case IGMP_HOST_LEAVE_MESSAGE: br_ip4_multicast_leave_group(brmctx, pmctx, ih->group, vid, src); break; } br_multicast_count(brmctx->br, p, skb, BR_INPUT_SKB_CB(skb)->igmp, BR_MCAST_DIR_RX); return err; } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_mrd_rcv(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct sk_buff *skb) { if (icmp6_hdr(skb)->icmp6_type != ICMPV6_MRDISC_ADV) return; spin_lock(&brmctx->br->multicast_lock); br_ip6_multicast_mark_router(brmctx, pmctx); spin_unlock(&brmctx->br->multicast_lock); } static int br_multicast_ipv6_rcv(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { struct net_bridge_port *p = pmctx ? 
pmctx->port : NULL; const unsigned char *src; struct mld_msg *mld; int err; err = ipv6_mc_check_mld(skb); if (err == -ENOMSG || err == -ENODATA) { if (!ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr)) BR_INPUT_SKB_CB(skb)->mrouters_only = 1; if (err == -ENODATA && ipv6_addr_is_all_snoopers(&ipv6_hdr(skb)->daddr)) br_ip6_multicast_mrd_rcv(brmctx, pmctx, skb); return 0; } else if (err < 0) { br_multicast_err_count(brmctx->br, p, skb->protocol); return err; } mld = (struct mld_msg *)skb_transport_header(skb); BR_INPUT_SKB_CB(skb)->igmp = mld->mld_type; switch (mld->mld_type) { case ICMPV6_MGM_REPORT: src = eth_hdr(skb)->h_source; BR_INPUT_SKB_CB(skb)->mrouters_only = 1; err = br_ip6_multicast_add_group(brmctx, pmctx, &mld->mld_mca, vid, src, true); break; case ICMPV6_MLD2_REPORT: err = br_ip6_multicast_mld2_report(brmctx, pmctx, skb, vid); break; case ICMPV6_MGM_QUERY: err = br_ip6_multicast_query(brmctx, pmctx, skb, vid); break; case ICMPV6_MGM_REDUCTION: src = eth_hdr(skb)->h_source; br_ip6_multicast_leave_group(brmctx, pmctx, &mld->mld_mca, vid, src); break; } br_multicast_count(brmctx->br, p, skb, BR_INPUT_SKB_CB(skb)->igmp, BR_MCAST_DIR_RX); return err; } #endif int br_multicast_rcv(struct net_bridge_mcast **brmctx, struct net_bridge_mcast_port **pmctx, struct net_bridge_vlan *vlan, struct sk_buff *skb, u16 vid) { int ret = 0; BR_INPUT_SKB_CB(skb)->igmp = 0; BR_INPUT_SKB_CB(skb)->mrouters_only = 0; if (!br_opt_get((*brmctx)->br, BROPT_MULTICAST_ENABLED)) return 0; if (br_opt_get((*brmctx)->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) && vlan) { const struct net_bridge_vlan *masterv; /* the vlan has the master flag set only when transmitting * through the bridge device */ if (br_vlan_is_master(vlan)) { masterv = vlan; *brmctx = &vlan->br_mcast_ctx; *pmctx = NULL; } else { masterv = vlan->brvlan; *brmctx = &vlan->brvlan->br_mcast_ctx; *pmctx = &vlan->port_mcast_ctx; } if (!(masterv->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)) return 0; } switch (skb->protocol) { case htons(ETH_P_IP): ret = br_multicast_ipv4_rcv(*brmctx, *pmctx, skb, vid); break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): ret = br_multicast_ipv6_rcv(*brmctx, *pmctx, skb, vid); break; #endif } return ret; } static void br_multicast_query_expired(struct net_bridge_mcast *brmctx, struct bridge_mcast_own_query *query, struct bridge_mcast_querier *querier) { spin_lock(&brmctx->br->multicast_lock); if (br_multicast_ctx_vlan_disabled(brmctx)) goto out; if (query->startup_sent < brmctx->multicast_startup_query_count) query->startup_sent++; br_multicast_send_query(brmctx, NULL, query); out: spin_unlock(&brmctx->br->multicast_lock); } static void br_ip4_multicast_query_expired(struct timer_list *t) { struct net_bridge_mcast *brmctx = from_timer(brmctx, t, ip4_own_query.timer); br_multicast_query_expired(brmctx, &brmctx->ip4_own_query, &brmctx->ip4_querier); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_query_expired(struct timer_list *t) { struct net_bridge_mcast *brmctx = from_timer(brmctx, t, ip6_own_query.timer); br_multicast_query_expired(brmctx, &brmctx->ip6_own_query, &brmctx->ip6_querier); } #endif static void br_multicast_gc_work(struct work_struct *work) { struct net_bridge *br = container_of(work, struct net_bridge, mcast_gc_work); HLIST_HEAD(deleted_head); spin_lock_bh(&br->multicast_lock); hlist_move_list(&br->mcast_gc_list, &deleted_head); spin_unlock_bh(&br->multicast_lock); br_multicast_gc(&deleted_head); } void br_multicast_ctx_init(struct net_bridge *br, struct net_bridge_vlan *vlan, struct 
net_bridge_mcast *brmctx) { brmctx->br = br; brmctx->vlan = vlan; brmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; brmctx->multicast_last_member_count = 2; brmctx->multicast_startup_query_count = 2; brmctx->multicast_last_member_interval = HZ; brmctx->multicast_query_response_interval = 10 * HZ; brmctx->multicast_startup_query_interval = 125 * HZ / 4; brmctx->multicast_query_interval = 125 * HZ; brmctx->multicast_querier_interval = 255 * HZ; brmctx->multicast_membership_interval = 260 * HZ; brmctx->ip4_querier.port_ifidx = 0; seqcount_spinlock_init(&brmctx->ip4_querier.seq, &br->multicast_lock); brmctx->multicast_igmp_version = 2; #if IS_ENABLED(CONFIG_IPV6) brmctx->multicast_mld_version = 1; brmctx->ip6_querier.port_ifidx = 0; seqcount_spinlock_init(&brmctx->ip6_querier.seq, &br->multicast_lock); #endif timer_setup(&brmctx->ip4_mc_router_timer, br_ip4_multicast_local_router_expired, 0); timer_setup(&brmctx->ip4_other_query.timer, br_ip4_multicast_querier_expired, 0); timer_setup(&brmctx->ip4_other_query.delay_timer, br_multicast_query_delay_expired, 0); timer_setup(&brmctx->ip4_own_query.timer, br_ip4_multicast_query_expired, 0); #if IS_ENABLED(CONFIG_IPV6) timer_setup(&brmctx->ip6_mc_router_timer, br_ip6_multicast_local_router_expired, 0); timer_setup(&brmctx->ip6_other_query.timer, br_ip6_multicast_querier_expired, 0); timer_setup(&brmctx->ip6_other_query.delay_timer, br_multicast_query_delay_expired, 0); timer_setup(&brmctx->ip6_own_query.timer, br_ip6_multicast_query_expired, 0); #endif } void br_multicast_ctx_deinit(struct net_bridge_mcast *brmctx) { __br_multicast_stop(brmctx); } void br_multicast_init(struct net_bridge *br) { br->hash_max = BR_MULTICAST_DEFAULT_HASH_MAX; br_multicast_ctx_init(br, NULL, &br->multicast_ctx); br_opt_toggle(br, BROPT_MULTICAST_ENABLED, true); br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, true); spin_lock_init(&br->multicast_lock); INIT_HLIST_HEAD(&br->mdb_list); INIT_HLIST_HEAD(&br->mcast_gc_list); INIT_WORK(&br->mcast_gc_work, br_multicast_gc_work); } static void br_ip4_multicast_join_snoopers(struct net_bridge *br) { struct in_device *in_dev = in_dev_get(br->dev); if (!in_dev) return; __ip_mc_inc_group(in_dev, htonl(INADDR_ALLSNOOPERS_GROUP), GFP_ATOMIC); in_dev_put(in_dev); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_join_snoopers(struct net_bridge *br) { struct in6_addr addr; ipv6_addr_set(&addr, htonl(0xff020000), 0, 0, htonl(0x6a)); ipv6_dev_mc_inc(br->dev, &addr); } #else static inline void br_ip6_multicast_join_snoopers(struct net_bridge *br) { } #endif void br_multicast_join_snoopers(struct net_bridge *br) { br_ip4_multicast_join_snoopers(br); br_ip6_multicast_join_snoopers(br); } static void br_ip4_multicast_leave_snoopers(struct net_bridge *br) { struct in_device *in_dev = in_dev_get(br->dev); if (WARN_ON(!in_dev)) return; __ip_mc_dec_group(in_dev, htonl(INADDR_ALLSNOOPERS_GROUP), GFP_ATOMIC); in_dev_put(in_dev); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_leave_snoopers(struct net_bridge *br) { struct in6_addr addr; ipv6_addr_set(&addr, htonl(0xff020000), 0, 0, htonl(0x6a)); ipv6_dev_mc_dec(br->dev, &addr); } #else static inline void br_ip6_multicast_leave_snoopers(struct net_bridge *br) { } #endif void br_multicast_leave_snoopers(struct net_bridge *br) { br_ip4_multicast_leave_snoopers(br); br_ip6_multicast_leave_snoopers(br); } static void __br_multicast_open_query(struct net_bridge *br, struct bridge_mcast_own_query *query) { query->startup_sent = 0; if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) return; 
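/* Added explanatory comment (not in the original file): arming the timer
 * at 'jiffies' makes the own-query machinery fire immediately;
 * startup_sent was reset above, so the next queries are treated as
 * startup queries until multicast_startup_query_count of them have been
 * sent.
 */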
mod_timer(&query->timer, jiffies); } static void __br_multicast_open(struct net_bridge_mcast *brmctx) { __br_multicast_open_query(brmctx->br, &brmctx->ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) __br_multicast_open_query(brmctx->br, &brmctx->ip6_own_query); #endif } void br_multicast_open(struct net_bridge *br) { ASSERT_RTNL(); if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *vlan; vg = br_vlan_group(br); if (vg) { list_for_each_entry(vlan, &vg->vlan_list, vlist) { struct net_bridge_mcast *brmctx; brmctx = &vlan->br_mcast_ctx; if (br_vlan_is_brentry(vlan) && !br_multicast_ctx_vlan_disabled(brmctx)) __br_multicast_open(&vlan->br_mcast_ctx); } } } else { __br_multicast_open(&br->multicast_ctx); } } static void __br_multicast_stop(struct net_bridge_mcast *brmctx) { del_timer_sync(&brmctx->ip4_mc_router_timer); del_timer_sync(&brmctx->ip4_other_query.timer); del_timer_sync(&brmctx->ip4_other_query.delay_timer); del_timer_sync(&brmctx->ip4_own_query.timer); #if IS_ENABLED(CONFIG_IPV6) del_timer_sync(&brmctx->ip6_mc_router_timer); del_timer_sync(&brmctx->ip6_other_query.timer); del_timer_sync(&brmctx->ip6_other_query.delay_timer); del_timer_sync(&brmctx->ip6_own_query.timer); #endif } void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on) { struct net_bridge *br; /* it's okay to check for the flag without the multicast lock because it * can only change under RTNL -> multicast_lock, we need the latter to * sync with timers and packets */ if (on == !!(vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED)) return; if (br_vlan_is_master(vlan)) { br = vlan->br; if (!br_vlan_is_brentry(vlan) || (on && br_multicast_ctx_vlan_global_disabled(&vlan->br_mcast_ctx))) return; spin_lock_bh(&br->multicast_lock); vlan->priv_flags ^= BR_VLFLAG_MCAST_ENABLED; spin_unlock_bh(&br->multicast_lock); if (on) __br_multicast_open(&vlan->br_mcast_ctx); else __br_multicast_stop(&vlan->br_mcast_ctx); } else { struct net_bridge_mcast *brmctx; brmctx = br_multicast_port_ctx_get_global(&vlan->port_mcast_ctx); if (on && br_multicast_ctx_vlan_global_disabled(brmctx)) return; br = vlan->port->br; spin_lock_bh(&br->multicast_lock); vlan->priv_flags ^= BR_VLFLAG_MCAST_ENABLED; if (on) __br_multicast_enable_port_ctx(&vlan->port_mcast_ctx); else __br_multicast_disable_port_ctx(&vlan->port_mcast_ctx); spin_unlock_bh(&br->multicast_lock); } } static void br_multicast_toggle_vlan(struct net_bridge_vlan *vlan, bool on) { struct net_bridge_port *p; if (WARN_ON_ONCE(!br_vlan_is_master(vlan))) return; list_for_each_entry(p, &vlan->br->port_list, list) { struct net_bridge_vlan *vport; vport = br_vlan_find(nbp_vlan_group(p), vlan->vid); if (!vport) continue; br_multicast_toggle_one_vlan(vport, on); } if (br_vlan_is_brentry(vlan)) br_multicast_toggle_one_vlan(vlan, on); } int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on, struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *vlan; struct net_bridge_port *p; if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) == on) return 0; if (on && !br_opt_get(br, BROPT_VLAN_ENABLED)) { NL_SET_ERR_MSG_MOD(extack, "Cannot enable multicast vlan snooping with vlan filtering disabled"); return -EINVAL; } vg = br_vlan_group(br); if (!vg) return 0; br_opt_toggle(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED, on); /* disable/enable non-vlan mcast contexts based on vlan snooping */ if (on) __br_multicast_stop(&br->multicast_ctx); else __br_multicast_open(&br->multicast_ctx); 
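/* Added explanatory comment (not in the original file): the same switch
 * is applied per port below - enabling vlan snooping disables the ports'
 * non-vlan multicast contexts (their per-vlan contexts take over), and
 * disabling it re-enables them. The per-vlan contexts themselves are
 * toggled via br_multicast_toggle_vlan() afterwards.
 */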
list_for_each_entry(p, &br->port_list, list) { if (on) br_multicast_disable_port(p); else br_multicast_enable_port(p); } list_for_each_entry(vlan, &vg->vlan_list, vlist) br_multicast_toggle_vlan(vlan, on); return 0; } bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, bool on) { ASSERT_RTNL(); /* BR_VLFLAG_GLOBAL_MCAST_ENABLED relies on eventual consistency and * requires only RTNL to change */ if (on == !!(vlan->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)) return false; vlan->priv_flags ^= BR_VLFLAG_GLOBAL_MCAST_ENABLED; br_multicast_toggle_vlan(vlan, on); return true; } void br_multicast_stop(struct net_bridge *br) { ASSERT_RTNL(); if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *vlan; vg = br_vlan_group(br); if (vg) { list_for_each_entry(vlan, &vg->vlan_list, vlist) { struct net_bridge_mcast *brmctx; brmctx = &vlan->br_mcast_ctx; if (br_vlan_is_brentry(vlan) && !br_multicast_ctx_vlan_disabled(brmctx)) __br_multicast_stop(&vlan->br_mcast_ctx); } } } else { __br_multicast_stop(&br->multicast_ctx); } } void br_multicast_dev_del(struct net_bridge *br) { struct net_bridge_mdb_entry *mp; HLIST_HEAD(deleted_head); struct hlist_node *tmp; spin_lock_bh(&br->multicast_lock); hlist_for_each_entry_safe(mp, tmp, &br->mdb_list, mdb_node) br_multicast_del_mdb_entry(mp); hlist_move_list(&br->mcast_gc_list, &deleted_head); spin_unlock_bh(&br->multicast_lock); br_multicast_ctx_deinit(&br->multicast_ctx); br_multicast_gc(&deleted_head); cancel_work_sync(&br->mcast_gc_work); rcu_barrier(); } int br_multicast_set_router(struct net_bridge_mcast *brmctx, unsigned long val) { int err = -EINVAL; spin_lock_bh(&brmctx->br->multicast_lock); switch (val) { case MDB_RTR_TYPE_DISABLED: case MDB_RTR_TYPE_PERM: br_mc_router_state_change(brmctx->br, val == MDB_RTR_TYPE_PERM); del_timer(&brmctx->ip4_mc_router_timer); #if IS_ENABLED(CONFIG_IPV6) del_timer(&brmctx->ip6_mc_router_timer); #endif brmctx->multicast_router = val; err = 0; break; case MDB_RTR_TYPE_TEMP_QUERY: if (brmctx->multicast_router != MDB_RTR_TYPE_TEMP_QUERY) br_mc_router_state_change(brmctx->br, false); brmctx->multicast_router = val; err = 0; break; } spin_unlock_bh(&brmctx->br->multicast_lock); return err; } static void br_multicast_rport_del_notify(struct net_bridge_mcast_port *pmctx, bool deleted) { if (!deleted) return; /* For backwards compatibility for now, only notify if there is * no multicast router anymore for both IPv4 and IPv6. 
*/ if (!hlist_unhashed(&pmctx->ip4_rlist)) return; #if IS_ENABLED(CONFIG_IPV6) if (!hlist_unhashed(&pmctx->ip6_rlist)) return; #endif br_rtr_notify(pmctx->port->br->dev, pmctx, RTM_DELMDB); br_port_mc_router_state_change(pmctx->port, false); /* don't allow timer refresh */ if (pmctx->multicast_router == MDB_RTR_TYPE_TEMP) pmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; } int br_multicast_set_port_router(struct net_bridge_mcast_port *pmctx, unsigned long val) { struct net_bridge_mcast *brmctx; unsigned long now = jiffies; int err = -EINVAL; bool del = false; brmctx = br_multicast_port_ctx_get_global(pmctx); spin_lock_bh(&brmctx->br->multicast_lock); if (pmctx->multicast_router == val) { /* Refresh the temp router port timer */ if (pmctx->multicast_router == MDB_RTR_TYPE_TEMP) { mod_timer(&pmctx->ip4_mc_router_timer, now + brmctx->multicast_querier_interval); #if IS_ENABLED(CONFIG_IPV6) mod_timer(&pmctx->ip6_mc_router_timer, now + brmctx->multicast_querier_interval); #endif } err = 0; goto unlock; } switch (val) { case MDB_RTR_TYPE_DISABLED: pmctx->multicast_router = MDB_RTR_TYPE_DISABLED; del |= br_ip4_multicast_rport_del(pmctx); del_timer(&pmctx->ip4_mc_router_timer); del |= br_ip6_multicast_rport_del(pmctx); #if IS_ENABLED(CONFIG_IPV6) del_timer(&pmctx->ip6_mc_router_timer); #endif br_multicast_rport_del_notify(pmctx, del); break; case MDB_RTR_TYPE_TEMP_QUERY: pmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; del |= br_ip4_multicast_rport_del(pmctx); del |= br_ip6_multicast_rport_del(pmctx); br_multicast_rport_del_notify(pmctx, del); break; case MDB_RTR_TYPE_PERM: pmctx->multicast_router = MDB_RTR_TYPE_PERM; del_timer(&pmctx->ip4_mc_router_timer); br_ip4_multicast_add_router(brmctx, pmctx); #if IS_ENABLED(CONFIG_IPV6) del_timer(&pmctx->ip6_mc_router_timer); #endif br_ip6_multicast_add_router(brmctx, pmctx); break; case MDB_RTR_TYPE_TEMP: pmctx->multicast_router = MDB_RTR_TYPE_TEMP; br_ip4_multicast_mark_router(brmctx, pmctx); br_ip6_multicast_mark_router(brmctx, pmctx); break; default: goto unlock; } err = 0; unlock: spin_unlock_bh(&brmctx->br->multicast_lock); return err; } int br_multicast_set_vlan_router(struct net_bridge_vlan *v, u8 mcast_router) { int err; if (br_vlan_is_master(v)) err = br_multicast_set_router(&v->br_mcast_ctx, mcast_router); else err = br_multicast_set_port_router(&v->port_mcast_ctx, mcast_router); return err; } static void br_multicast_start_querier(struct net_bridge_mcast *brmctx, struct bridge_mcast_own_query *query) { struct net_bridge_port *port; if (!br_multicast_ctx_matches_vlan_snooping(brmctx)) return; __br_multicast_open_query(brmctx->br, query); rcu_read_lock(); list_for_each_entry_rcu(port, &brmctx->br->port_list, list) { struct bridge_mcast_own_query *ip4_own_query; #if IS_ENABLED(CONFIG_IPV6) struct bridge_mcast_own_query *ip6_own_query; #endif if (br_multicast_port_ctx_state_stopped(&port->multicast_ctx)) continue; if (br_multicast_ctx_is_vlan(brmctx)) { struct net_bridge_vlan *vlan; vlan = br_vlan_find(nbp_vlan_group_rcu(port), brmctx->vlan->vid); if (!vlan || br_multicast_port_ctx_state_stopped(&vlan->port_mcast_ctx)) continue; ip4_own_query = &vlan->port_mcast_ctx.ip4_own_query; #if IS_ENABLED(CONFIG_IPV6) ip6_own_query = &vlan->port_mcast_ctx.ip6_own_query; #endif } else { ip4_own_query = &port->multicast_ctx.ip4_own_query; #if IS_ENABLED(CONFIG_IPV6) ip6_own_query = &port->multicast_ctx.ip6_own_query; #endif } if (query == &brmctx->ip4_own_query) br_multicast_enable(ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) else 
br_multicast_enable(ip6_own_query); #endif } rcu_read_unlock(); } int br_multicast_toggle(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { struct net_bridge_port *port; bool change_snoopers = false; int err = 0; spin_lock_bh(&br->multicast_lock); if (!!br_opt_get(br, BROPT_MULTICAST_ENABLED) == !!val) goto unlock; err = br_mc_disabled_update(br->dev, val, extack); if (err == -EOPNOTSUPP) err = 0; if (err) goto unlock; br_opt_toggle(br, BROPT_MULTICAST_ENABLED, !!val); if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) { change_snoopers = true; goto unlock; } if (!netif_running(br->dev)) goto unlock; br_multicast_open(br); list_for_each_entry(port, &br->port_list, list) __br_multicast_enable_port_ctx(&port->multicast_ctx); change_snoopers = true; unlock: spin_unlock_bh(&br->multicast_lock); /* br_multicast_join_snoopers has the potential to cause * an MLD Report/Leave to be delivered to br_multicast_rcv, * which would in turn call br_multicast_add_group, which would * attempt to acquire multicast_lock. This function should be * called after the lock has been released to avoid deadlocks on * multicast_lock. * * br_multicast_leave_snoopers does not have the problem since * br_multicast_rcv first checks BROPT_MULTICAST_ENABLED, and * returns without calling br_multicast_ipv4/6_rcv if it's not * enabled. Moved both functions out just for symmetry. */ if (change_snoopers) { if (br_opt_get(br, BROPT_MULTICAST_ENABLED)) br_multicast_join_snoopers(br); else br_multicast_leave_snoopers(br); } return err; } bool br_multicast_enabled(const struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); return !!br_opt_get(br, BROPT_MULTICAST_ENABLED); } EXPORT_SYMBOL_GPL(br_multicast_enabled); bool br_multicast_router(const struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); bool is_router; spin_lock_bh(&br->multicast_lock); is_router = br_multicast_is_router(&br->multicast_ctx, NULL); spin_unlock_bh(&br->multicast_lock); return is_router; } EXPORT_SYMBOL_GPL(br_multicast_router); int br_multicast_set_querier(struct net_bridge_mcast *brmctx, unsigned long val) { unsigned long max_delay; val = !!val; spin_lock_bh(&brmctx->br->multicast_lock); if (brmctx->multicast_querier == val) goto unlock; WRITE_ONCE(brmctx->multicast_querier, val); if (!val) goto unlock; max_delay = brmctx->multicast_query_response_interval; if (!timer_pending(&brmctx->ip4_other_query.timer)) mod_timer(&brmctx->ip4_other_query.delay_timer, jiffies + max_delay); br_multicast_start_querier(brmctx, &brmctx->ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) if (!timer_pending(&brmctx->ip6_other_query.timer)) mod_timer(&brmctx->ip6_other_query.delay_timer, jiffies + max_delay); br_multicast_start_querier(brmctx, &brmctx->ip6_own_query); #endif unlock: spin_unlock_bh(&brmctx->br->multicast_lock); return 0; } int br_multicast_set_igmp_version(struct net_bridge_mcast *brmctx, unsigned long val) { /* Currently we support only version 2 and 3 */ switch (val) { case 2: case 3: break; default: return -EINVAL; } spin_lock_bh(&brmctx->br->multicast_lock); brmctx->multicast_igmp_version = val; spin_unlock_bh(&brmctx->br->multicast_lock); return 0; } #if IS_ENABLED(CONFIG_IPV6) int br_multicast_set_mld_version(struct net_bridge_mcast *brmctx, unsigned long val) { /* Currently we support version 1 and 2 */ switch (val) { case 1: case 2: break; default: return -EINVAL; } spin_lock_bh(&brmctx->br->multicast_lock); brmctx->multicast_mld_version = val; spin_unlock_bh(&brmctx->br->multicast_lock); return 0; } #endif 
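/* Illustrative sketch (added for exposition, not part of the kernel
 * source): the two interval setters that follow share the same pattern -
 * convert the user-supplied clock_t value to jiffies and clamp it to a
 * compile-time minimum, logging when the clamp kicks in. A hypothetical
 * helper capturing that pattern could look like the function below; the
 * name br_multicast_clamp_intvl() is made up for illustration only.
 */
static unsigned long br_multicast_clamp_intvl(struct net_bridge *br,
					      unsigned long clock_t_val,
					      unsigned long min_jiffies,
					      const char *what)
{
	/* userspace passes intervals in clock_t units (1/100 s) */
	unsigned long intvl_jiffies = clock_t_to_jiffies(clock_t_val);

	if (intvl_jiffies < min_jiffies) {
		br_info(br, "trying to set %s below minimum, setting to %lu (%ums)\n",
			what, jiffies_to_clock_t(min_jiffies),
			jiffies_to_msecs(min_jiffies));
		intvl_jiffies = min_jiffies;
	}

	return intvl_jiffies;
}
/* With such a helper, each setter below would reduce to a single
 * assignment; the original functions keep the pattern open-coded, as
 * shown next.
 */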
void br_multicast_set_query_intvl(struct net_bridge_mcast *brmctx, unsigned long val) { unsigned long intvl_jiffies = clock_t_to_jiffies(val); if (intvl_jiffies < BR_MULTICAST_QUERY_INTVL_MIN) { br_info(brmctx->br, "trying to set multicast query interval below minimum, setting to %lu (%ums)\n", jiffies_to_clock_t(BR_MULTICAST_QUERY_INTVL_MIN), jiffies_to_msecs(BR_MULTICAST_QUERY_INTVL_MIN)); intvl_jiffies = BR_MULTICAST_QUERY_INTVL_MIN; } brmctx->multicast_query_interval = intvl_jiffies; } void br_multicast_set_startup_query_intvl(struct net_bridge_mcast *brmctx, unsigned long val) { unsigned long intvl_jiffies = clock_t_to_jiffies(val); if (intvl_jiffies < BR_MULTICAST_STARTUP_QUERY_INTVL_MIN) { br_info(brmctx->br, "trying to set multicast startup query interval below minimum, setting to %lu (%ums)\n", jiffies_to_clock_t(BR_MULTICAST_STARTUP_QUERY_INTVL_MIN), jiffies_to_msecs(BR_MULTICAST_STARTUP_QUERY_INTVL_MIN)); intvl_jiffies = BR_MULTICAST_STARTUP_QUERY_INTVL_MIN; } brmctx->multicast_startup_query_interval = intvl_jiffies; } /** * br_multicast_list_adjacent - Returns snooped multicast addresses * @dev: The bridge port adjacent to which to retrieve addresses * @br_ip_list: The list to store found, snooped multicast IP addresses in * * Creates a list of IP addresses (struct br_ip_list) sensed by the multicast * snooping feature on all bridge ports of dev's bridge device, excluding * the addresses from dev itself. * * Returns the number of items added to br_ip_list. * * Notes: * - br_ip_list needs to be initialized by caller * - br_ip_list might contain duplicates in the end * (needs to be taken care of by caller) * - br_ip_list needs to be freed by caller */ int br_multicast_list_adjacent(struct net_device *dev, struct list_head *br_ip_list) { struct net_bridge *br; struct net_bridge_port *port; struct net_bridge_port_group *group; struct br_ip_list *entry; int count = 0; rcu_read_lock(); if (!br_ip_list || !netif_is_bridge_port(dev)) goto unlock; port = br_port_get_rcu(dev); if (!port || !port->br) goto unlock; br = port->br; list_for_each_entry_rcu(port, &br->port_list, list) { if (!port->dev || port->dev == dev) continue; hlist_for_each_entry_rcu(group, &port->mglist, mglist) { entry = kmalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) goto unlock; entry->addr = group->key.addr; list_add(&entry->list, br_ip_list); count++; } } unlock: rcu_read_unlock(); return count; } EXPORT_SYMBOL_GPL(br_multicast_list_adjacent); /** * br_multicast_has_querier_anywhere - Checks for a querier on a bridge * @dev: The bridge port providing the bridge on which to check for a querier * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6 * * Checks whether the given interface has a bridge on top and if so returns * true if a valid querier exists anywhere on the bridged link layer. * Otherwise returns false. 
*/ bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto) { struct net_bridge *br; struct net_bridge_port *port; struct ethhdr eth; bool ret = false; rcu_read_lock(); if (!netif_is_bridge_port(dev)) goto unlock; port = br_port_get_rcu(dev); if (!port || !port->br) goto unlock; br = port->br; memset(&eth, 0, sizeof(eth)); eth.h_proto = htons(proto); ret = br_multicast_querier_exists(&br->multicast_ctx, &eth, NULL); unlock: rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(br_multicast_has_querier_anywhere); /** * br_multicast_has_querier_adjacent - Checks for a querier behind a bridge port * @dev: The bridge port adjacent to which to check for a querier * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6 * * Checks whether the given interface has a bridge on top and if so returns * true if a selected querier is behind one of the other ports of this * bridge. Otherwise returns false. */ bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto) { struct net_bridge_mcast *brmctx; struct net_bridge *br; struct net_bridge_port *port; bool ret = false; int port_ifidx; rcu_read_lock(); if (!netif_is_bridge_port(dev)) goto unlock; port = br_port_get_rcu(dev); if (!port || !port->br) goto unlock; br = port->br; brmctx = &br->multicast_ctx; switch (proto) { case ETH_P_IP: port_ifidx = brmctx->ip4_querier.port_ifidx; if (!timer_pending(&brmctx->ip4_other_query.timer) || port_ifidx == port->dev->ifindex) goto unlock; break; #if IS_ENABLED(CONFIG_IPV6) case ETH_P_IPV6: port_ifidx = brmctx->ip6_querier.port_ifidx; if (!timer_pending(&brmctx->ip6_other_query.timer) || port_ifidx == port->dev->ifindex) goto unlock; break; #endif default: goto unlock; } ret = true; unlock: rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(br_multicast_has_querier_adjacent); /** * br_multicast_has_router_adjacent - Checks for a router behind a bridge port * @dev: The bridge port adjacent to which to check for a multicast router * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6 * * Checks whether the given interface has a bridge on top and if so returns * true if a multicast router is behind one of the other ports of this * bridge. Otherwise returns false. 
*/ bool br_multicast_has_router_adjacent(struct net_device *dev, int proto) { struct net_bridge_mcast_port *pmctx; struct net_bridge_mcast *brmctx; struct net_bridge_port *port; bool ret = false; rcu_read_lock(); port = br_port_get_check_rcu(dev); if (!port) goto unlock; brmctx = &port->br->multicast_ctx; switch (proto) { case ETH_P_IP: hlist_for_each_entry_rcu(pmctx, &brmctx->ip4_mc_router_list, ip4_rlist) { if (pmctx->port == port) continue; ret = true; goto unlock; } break; #if IS_ENABLED(CONFIG_IPV6) case ETH_P_IPV6: hlist_for_each_entry_rcu(pmctx, &brmctx->ip6_mc_router_list, ip6_rlist) { if (pmctx->port == port) continue; ret = true; goto unlock; } break; #endif default: /* when compiled without IPv6 support, be conservative and * always assume presence of an IPv6 multicast router */ ret = true; } unlock: rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(br_multicast_has_router_adjacent); static void br_mcast_stats_add(struct bridge_mcast_stats __percpu *stats, const struct sk_buff *skb, u8 type, u8 dir) { struct bridge_mcast_stats *pstats = this_cpu_ptr(stats); __be16 proto = skb->protocol; unsigned int t_len; u64_stats_update_begin(&pstats->syncp); switch (proto) { case htons(ETH_P_IP): t_len = ntohs(ip_hdr(skb)->tot_len) - ip_hdrlen(skb); switch (type) { case IGMP_HOST_MEMBERSHIP_REPORT: pstats->mstats.igmp_v1reports[dir]++; break; case IGMPV2_HOST_MEMBERSHIP_REPORT: pstats->mstats.igmp_v2reports[dir]++; break; case IGMPV3_HOST_MEMBERSHIP_REPORT: pstats->mstats.igmp_v3reports[dir]++; break; case IGMP_HOST_MEMBERSHIP_QUERY: if (t_len != sizeof(struct igmphdr)) { pstats->mstats.igmp_v3queries[dir]++; } else { unsigned int offset = skb_transport_offset(skb); struct igmphdr *ih, _ihdr; ih = skb_header_pointer(skb, offset, sizeof(_ihdr), &_ihdr); if (!ih) break; if (!ih->code) pstats->mstats.igmp_v1queries[dir]++; else pstats->mstats.igmp_v2queries[dir]++; } break; case IGMP_HOST_LEAVE_MESSAGE: pstats->mstats.igmp_leaves[dir]++; break; } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): t_len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr); t_len -= skb_network_header_len(skb); switch (type) { case ICMPV6_MGM_REPORT: pstats->mstats.mld_v1reports[dir]++; break; case ICMPV6_MLD2_REPORT: pstats->mstats.mld_v2reports[dir]++; break; case ICMPV6_MGM_QUERY: if (t_len != sizeof(struct mld_msg)) pstats->mstats.mld_v2queries[dir]++; else pstats->mstats.mld_v1queries[dir]++; break; case ICMPV6_MGM_REDUCTION: pstats->mstats.mld_leaves[dir]++; break; } break; #endif /* CONFIG_IPV6 */ } u64_stats_update_end(&pstats->syncp); } void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p, const struct sk_buff *skb, u8 type, u8 dir) { struct bridge_mcast_stats __percpu *stats; /* if multicast_disabled is true then igmp type can't be set */ if (!type || !br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) return; if (p) stats = p->mcast_stats; else stats = br->mcast_stats; if (WARN_ON(!stats)) return; br_mcast_stats_add(stats, skb, type, dir); } int br_multicast_init_stats(struct net_bridge *br) { br->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats); if (!br->mcast_stats) return -ENOMEM; return 0; } void br_multicast_uninit_stats(struct net_bridge *br) { free_percpu(br->mcast_stats); } /* noinline for https://llvm.org/pr45802#c9 */ static noinline_for_stack void mcast_stats_add_dir(u64 *dst, u64 *src) { dst[BR_MCAST_DIR_RX] += src[BR_MCAST_DIR_RX]; dst[BR_MCAST_DIR_TX] += src[BR_MCAST_DIR_TX]; } void br_multicast_get_stats(const struct net_bridge *br, 
const struct net_bridge_port *p, struct br_mcast_stats *dest) { struct bridge_mcast_stats __percpu *stats; struct br_mcast_stats tdst; int i; memset(dest, 0, sizeof(*dest)); if (p) stats = p->mcast_stats; else stats = br->mcast_stats; if (WARN_ON(!stats)) return; memset(&tdst, 0, sizeof(tdst)); for_each_possible_cpu(i) { struct bridge_mcast_stats *cpu_stats = per_cpu_ptr(stats, i); struct br_mcast_stats temp; unsigned int start; do { start = u64_stats_fetch_begin(&cpu_stats->syncp); memcpy(&temp, &cpu_stats->mstats, sizeof(temp)); } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); mcast_stats_add_dir(tdst.igmp_v1queries, temp.igmp_v1queries); mcast_stats_add_dir(tdst.igmp_v2queries, temp.igmp_v2queries); mcast_stats_add_dir(tdst.igmp_v3queries, temp.igmp_v3queries); mcast_stats_add_dir(tdst.igmp_leaves, temp.igmp_leaves); mcast_stats_add_dir(tdst.igmp_v1reports, temp.igmp_v1reports); mcast_stats_add_dir(tdst.igmp_v2reports, temp.igmp_v2reports); mcast_stats_add_dir(tdst.igmp_v3reports, temp.igmp_v3reports); tdst.igmp_parse_errors += temp.igmp_parse_errors; mcast_stats_add_dir(tdst.mld_v1queries, temp.mld_v1queries); mcast_stats_add_dir(tdst.mld_v2queries, temp.mld_v2queries); mcast_stats_add_dir(tdst.mld_leaves, temp.mld_leaves); mcast_stats_add_dir(tdst.mld_v1reports, temp.mld_v1reports); mcast_stats_add_dir(tdst.mld_v2reports, temp.mld_v2reports); tdst.mld_parse_errors += temp.mld_parse_errors; } memcpy(dest, &tdst, sizeof(*dest)); } int br_mdb_hash_init(struct net_bridge *br) { int err; err = rhashtable_init(&br->sg_port_tbl, &br_sg_port_rht_params); if (err) return err; err = rhashtable_init(&br->mdb_hash_tbl, &br_mdb_rht_params); if (err) { rhashtable_destroy(&br->sg_port_tbl); return err; } return 0; } void br_mdb_hash_fini(struct net_bridge *br) { rhashtable_destroy(&br->sg_port_tbl); rhashtable_destroy(&br->mdb_hash_tbl); }
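/*
 * Editor's illustrative sketch, not part of the bridge sources: the
 * expected pairing of br_mdb_hash_init() and br_mdb_hash_fini() in a
 * hypothetical setup path. br_mdb_hash_init() already unwinds its own
 * partial state (it destroys sg_port_tbl if mdb_hash_tbl setup fails),
 * so a caller only calls br_mdb_hash_fini() once both tables are known
 * to be initialized. example_later_setup is a stand-in for whatever
 * follows in a real init sequence.
 */
static int example_bridge_mdb_setup(struct net_bridge *br,
				    int (*example_later_setup)(struct net_bridge *br))
{
	int err;

	err = br_mdb_hash_init(br);		/* sets up both rhashtables */
	if (err)
		return err;

	err = example_later_setup(br);		/* hypothetical follow-on init */
	if (err)
		br_mdb_hash_fini(br);		/* tear both tables down together */

	return err;
}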
// SPDX-License-Identifier: GPL-2.0-only /* * This is the linux wireless configuration interface. * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH * Copyright (C) 2018-2024 Intel Corporation */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/if.h> #include <linux/module.h> #include <linux/err.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/nl80211.h> #include <linux/debugfs.h> #include <linux/notifier.h> #include <linux/device.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/sched.h> #include <net/genetlink.h> #include <net/cfg80211.h> #include "nl80211.h" #include "core.h" #include "sysfs.h" #include "debugfs.h" #include "wext-compat.h" #include "rdev-ops.h" /* name for sysfs, %d is appended */ #define PHY_NAME "phy" MODULE_AUTHOR("Johannes Berg"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("wireless configuration support"); MODULE_ALIAS_GENL_FAMILY(NL80211_GENL_NAME); /* RCU-protected (and RTNL for writers) */ LIST_HEAD(cfg80211_rdev_list); int cfg80211_rdev_list_generation; /* for debugfs */ static struct dentry *ieee80211_debugfs_dir; /* for the cleanup, scan and event works */ struct workqueue_struct *cfg80211_wq; static bool cfg80211_disable_40mhz_24ghz; module_param(cfg80211_disable_40mhz_24ghz, bool, 0644); MODULE_PARM_DESC(cfg80211_disable_40mhz_24ghz, "Disable 40MHz support in the 2.4GHz band"); struct cfg80211_registered_device *cfg80211_rdev_by_wiphy_idx(int wiphy_idx) { struct cfg80211_registered_device *result = NULL, *rdev; ASSERT_RTNL(); for_each_rdev(rdev) { if (rdev->wiphy_idx == wiphy_idx) { result = rdev; break; } } return result; } int get_wiphy_idx(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); return rdev->wiphy_idx; } struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx) { struct cfg80211_registered_device *rdev; ASSERT_RTNL(); rdev = cfg80211_rdev_by_wiphy_idx(wiphy_idx); if (!rdev) return NULL; return &rdev->wiphy; } static int cfg80211_dev_check_name(struct cfg80211_registered_device *rdev, const char *newname) { struct cfg80211_registered_device *rdev2; int wiphy_idx, taken = -1,
digits; ASSERT_RTNL(); if (strlen(newname) > NL80211_WIPHY_NAME_MAXLEN) return -EINVAL; /* prohibit calling the thing phy%d when %d is not its number */ sscanf(newname, PHY_NAME "%d%n", &wiphy_idx, &taken); if (taken == strlen(newname) && wiphy_idx != rdev->wiphy_idx) { /* count number of places needed to print wiphy_idx */ digits = 1; while (wiphy_idx /= 10) digits++; /* * deny the name if it is phy<idx> where <idx> is printed * without leading zeroes. taken == strlen(newname) here */ if (taken == strlen(PHY_NAME) + digits) return -EINVAL; } /* Ensure another device does not already have this name. */ for_each_rdev(rdev2) if (strcmp(newname, wiphy_name(&rdev2->wiphy)) == 0) return -EINVAL; return 0; } int cfg80211_dev_rename(struct cfg80211_registered_device *rdev, char *newname) { int result; ASSERT_RTNL(); lockdep_assert_wiphy(&rdev->wiphy); /* Ignore nop renames */ if (strcmp(newname, wiphy_name(&rdev->wiphy)) == 0) return 0; result = cfg80211_dev_check_name(rdev, newname); if (result < 0) return result; result = device_rename(&rdev->wiphy.dev, newname); if (result) return result; if (!IS_ERR_OR_NULL(rdev->wiphy.debugfsdir)) debugfs_rename(rdev->wiphy.debugfsdir->d_parent, rdev->wiphy.debugfsdir, rdev->wiphy.debugfsdir->d_parent, newname); nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY); return 0; } int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, struct net *net) { struct wireless_dev *wdev; int err = 0; if (!(rdev->wiphy.flags & WIPHY_FLAG_NETNS_OK)) return -EOPNOTSUPP; list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) continue; wdev->netdev->netns_local = false; err = dev_change_net_namespace(wdev->netdev, net, "wlan%d"); if (err) break; wdev->netdev->netns_local = true; } if (err) { /* failed -- clean up to old netns */ net = wiphy_net(&rdev->wiphy); list_for_each_entry_continue_reverse(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) continue; wdev->netdev->netns_local = false; err = dev_change_net_namespace(wdev->netdev, net, "wlan%d"); WARN_ON(err); wdev->netdev->netns_local = true; } return err; } wiphy_lock(&rdev->wiphy); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) continue; nl80211_notify_iface(rdev, wdev, NL80211_CMD_DEL_INTERFACE); } nl80211_notify_wiphy(rdev, NL80211_CMD_DEL_WIPHY); wiphy_net_set(&rdev->wiphy, net); err = device_rename(&rdev->wiphy.dev, dev_name(&rdev->wiphy.dev)); WARN_ON(err); nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) continue; nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE); } wiphy_unlock(&rdev->wiphy); return 0; } static void cfg80211_rfkill_poll(struct rfkill *rfkill, void *data) { struct cfg80211_registered_device *rdev = data; wiphy_lock(&rdev->wiphy); rdev_rfkill_poll(rdev); wiphy_unlock(&rdev->wiphy); } void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { lockdep_assert_held(&rdev->wiphy.mtx); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_P2P_DEVICE)) return; if (!wdev_running(wdev)) return; rdev_stop_p2p_device(rdev, wdev); wdev->is_running = false; rdev->opencount--; if (rdev->scan_req && rdev->scan_req->wdev == wdev) { if (WARN_ON(!rdev->scan_req->notified && (!rdev->int_scan_req || !rdev->int_scan_req->notified))) rdev->scan_req->info.aborted = true; ___cfg80211_scan_done(rdev, false); } } void cfg80211_stop_nan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { 
lockdep_assert_held(&rdev->wiphy.mtx); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_NAN)) return; if (!wdev_running(wdev)) return; rdev_stop_nan(rdev, wdev); wdev->is_running = false; rdev->opencount--; } void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct wireless_dev *wdev; ASSERT_RTNL(); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (wdev->netdev) { dev_close(wdev->netdev); continue; } /* otherwise, check iftype */ wiphy_lock(wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_P2P_DEVICE: cfg80211_stop_p2p_device(rdev, wdev); break; case NL80211_IFTYPE_NAN: cfg80211_stop_nan(rdev, wdev); break; default: break; } wiphy_unlock(wiphy); } } EXPORT_SYMBOL_GPL(cfg80211_shutdown_all_interfaces); static int cfg80211_rfkill_set_block(void *data, bool blocked) { struct cfg80211_registered_device *rdev = data; if (!blocked) return 0; rtnl_lock(); cfg80211_shutdown_all_interfaces(&rdev->wiphy); rtnl_unlock(); return 0; } static void cfg80211_rfkill_block_work(struct work_struct *work) { struct cfg80211_registered_device *rdev; rdev = container_of(work, struct cfg80211_registered_device, rfkill_block); cfg80211_rfkill_set_block(rdev, true); } static void cfg80211_event_work(struct work_struct *work) { struct cfg80211_registered_device *rdev; rdev = container_of(work, struct cfg80211_registered_device, event_work); wiphy_lock(&rdev->wiphy); cfg80211_process_rdev_events(rdev); wiphy_unlock(&rdev->wiphy); } void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev) { struct wireless_dev *wdev, *tmp; ASSERT_RTNL(); list_for_each_entry_safe(wdev, tmp, &rdev->wiphy.wdev_list, list) { if (wdev->nl_owner_dead) { if (wdev->netdev) dev_close(wdev->netdev); wiphy_lock(&rdev->wiphy); cfg80211_leave(rdev, wdev); cfg80211_remove_virtual_intf(rdev, wdev); wiphy_unlock(&rdev->wiphy); } } } static void cfg80211_destroy_iface_wk(struct work_struct *work) { struct cfg80211_registered_device *rdev; rdev = container_of(work, struct cfg80211_registered_device, destroy_work); rtnl_lock(); cfg80211_destroy_ifaces(rdev); rtnl_unlock(); } static void cfg80211_sched_scan_stop_wk(struct wiphy *wiphy, struct wiphy_work *work) { struct cfg80211_registered_device *rdev; struct cfg80211_sched_scan_request *req, *tmp; rdev = container_of(work, struct cfg80211_registered_device, sched_scan_stop_wk); list_for_each_entry_safe(req, tmp, &rdev->sched_scan_req_list, list) { if (req->nl_owner_dead) cfg80211_stop_sched_scan_req(rdev, req, false); } } static void cfg80211_propagate_radar_detect_wk(struct work_struct *work) { struct cfg80211_registered_device *rdev; rdev = container_of(work, struct cfg80211_registered_device, propagate_radar_detect_wk); rtnl_lock(); regulatory_propagate_dfs_state(&rdev->wiphy, &rdev->radar_chandef, NL80211_DFS_UNAVAILABLE, NL80211_RADAR_DETECTED); rtnl_unlock(); } static void cfg80211_propagate_cac_done_wk(struct work_struct *work) { struct cfg80211_registered_device *rdev; rdev = container_of(work, struct cfg80211_registered_device, propagate_cac_done_wk); rtnl_lock(); regulatory_propagate_dfs_state(&rdev->wiphy, &rdev->cac_done_chandef, NL80211_DFS_AVAILABLE, NL80211_RADAR_CAC_FINISHED); rtnl_unlock(); } static void cfg80211_wiphy_work(struct work_struct *work) { struct cfg80211_registered_device *rdev; struct wiphy_work *wk; rdev = container_of(work, struct cfg80211_registered_device, wiphy_work); trace_wiphy_work_worker_start(&rdev->wiphy); wiphy_lock(&rdev->wiphy); if (rdev->suspended) goto out; 
spin_lock_irq(&rdev->wiphy_work_lock); wk = list_first_entry_or_null(&rdev->wiphy_work_list, struct wiphy_work, entry); if (wk) { list_del_init(&wk->entry); if (!list_empty(&rdev->wiphy_work_list)) queue_work(system_unbound_wq, work); spin_unlock_irq(&rdev->wiphy_work_lock); trace_wiphy_work_run(&rdev->wiphy, wk); wk->func(&rdev->wiphy, wk); } else { spin_unlock_irq(&rdev->wiphy_work_lock); } out: wiphy_unlock(&rdev->wiphy); } /* exported functions */ struct wiphy *wiphy_new_nm(const struct cfg80211_ops *ops, int sizeof_priv, const char *requested_name) { static atomic_t wiphy_counter = ATOMIC_INIT(0); struct cfg80211_registered_device *rdev; int alloc_size; WARN_ON(ops->add_key && (!ops->del_key || !ops->set_default_key)); WARN_ON(ops->auth && (!ops->assoc || !ops->deauth || !ops->disassoc)); WARN_ON(ops->connect && !ops->disconnect); WARN_ON(ops->join_ibss && !ops->leave_ibss); WARN_ON(ops->add_virtual_intf && !ops->del_virtual_intf); WARN_ON(ops->add_station && !ops->del_station); WARN_ON(ops->add_mpath && !ops->del_mpath); WARN_ON(ops->join_mesh && !ops->leave_mesh); WARN_ON(ops->start_p2p_device && !ops->stop_p2p_device); WARN_ON(ops->start_ap && !ops->stop_ap); WARN_ON(ops->join_ocb && !ops->leave_ocb); WARN_ON(ops->suspend && !ops->resume); WARN_ON(ops->sched_scan_start && !ops->sched_scan_stop); WARN_ON(ops->remain_on_channel && !ops->cancel_remain_on_channel); WARN_ON(ops->tdls_channel_switch && !ops->tdls_cancel_channel_switch); WARN_ON(ops->add_tx_ts && !ops->del_tx_ts); alloc_size = sizeof(*rdev) + sizeof_priv; rdev = kzalloc(alloc_size, GFP_KERNEL); if (!rdev) return NULL; rdev->ops = ops; rdev->wiphy_idx = atomic_inc_return(&wiphy_counter); if (unlikely(rdev->wiphy_idx < 0)) { /* ugh, wrapped! */ atomic_dec(&wiphy_counter); kfree(rdev); return NULL; } /* atomic_inc_return makes it start at 1, make it start at 0 */ rdev->wiphy_idx--; /* give it a proper name */ if (requested_name && requested_name[0]) { int rv; rtnl_lock(); rv = cfg80211_dev_check_name(rdev, requested_name); if (rv < 0) { rtnl_unlock(); goto use_default_name; } rv = dev_set_name(&rdev->wiphy.dev, "%s", requested_name); rtnl_unlock(); if (rv) goto use_default_name; } else { int rv; use_default_name: /* NOTE: This is *probably* safe w/out holding rtnl because of * the restrictions on phy names. Probably this call could * fail if some other part of the kernel (re)named a device * phyX. But, might should add some locking and check return * value, and use a different name if this one exists? 
*/ rv = dev_set_name(&rdev->wiphy.dev, PHY_NAME "%d", rdev->wiphy_idx); if (rv < 0) { kfree(rdev); return NULL; } } mutex_init(&rdev->wiphy.mtx); INIT_LIST_HEAD(&rdev->wiphy.wdev_list); INIT_LIST_HEAD(&rdev->beacon_registrations); spin_lock_init(&rdev->beacon_registrations_lock); spin_lock_init(&rdev->bss_lock); INIT_LIST_HEAD(&rdev->bss_list); INIT_LIST_HEAD(&rdev->sched_scan_req_list); wiphy_work_init(&rdev->scan_done_wk, __cfg80211_scan_done); INIT_DELAYED_WORK(&rdev->dfs_update_channels_wk, cfg80211_dfs_channels_update_work); #ifdef CONFIG_CFG80211_WEXT rdev->wiphy.wext = &cfg80211_wext_handler; #endif device_initialize(&rdev->wiphy.dev); rdev->wiphy.dev.class = &ieee80211_class; rdev->wiphy.dev.platform_data = rdev; device_enable_async_suspend(&rdev->wiphy.dev); INIT_WORK(&rdev->destroy_work, cfg80211_destroy_iface_wk); wiphy_work_init(&rdev->sched_scan_stop_wk, cfg80211_sched_scan_stop_wk); INIT_WORK(&rdev->sched_scan_res_wk, cfg80211_sched_scan_results_wk); INIT_WORK(&rdev->propagate_radar_detect_wk, cfg80211_propagate_radar_detect_wk); INIT_WORK(&rdev->propagate_cac_done_wk, cfg80211_propagate_cac_done_wk); INIT_WORK(&rdev->mgmt_registrations_update_wk, cfg80211_mgmt_registrations_update_wk); spin_lock_init(&rdev->mgmt_registrations_lock); #ifdef CONFIG_CFG80211_DEFAULT_PS rdev->wiphy.flags |= WIPHY_FLAG_PS_ON_BY_DEFAULT; #endif wiphy_net_set(&rdev->wiphy, &init_net); rdev->rfkill_ops.set_block = cfg80211_rfkill_set_block; rdev->wiphy.rfkill = rfkill_alloc(dev_name(&rdev->wiphy.dev), &rdev->wiphy.dev, RFKILL_TYPE_WLAN, &rdev->rfkill_ops, rdev); if (!rdev->wiphy.rfkill) { wiphy_free(&rdev->wiphy); return NULL; } INIT_WORK(&rdev->wiphy_work, cfg80211_wiphy_work); INIT_LIST_HEAD(&rdev->wiphy_work_list); spin_lock_init(&rdev->wiphy_work_lock); INIT_WORK(&rdev->rfkill_block, cfg80211_rfkill_block_work); INIT_WORK(&rdev->conn_work, cfg80211_conn_work); INIT_WORK(&rdev->event_work, cfg80211_event_work); INIT_WORK(&rdev->background_cac_abort_wk, cfg80211_background_cac_abort_wk); INIT_DELAYED_WORK(&rdev->background_cac_done_wk, cfg80211_background_cac_done_wk); init_waitqueue_head(&rdev->dev_wait); /* * Initialize wiphy parameters to IEEE 802.11 MIB default values. * Fragmentation and RTS threshold are disabled by default with the * special -1 value. */ rdev->wiphy.retry_short = 7; rdev->wiphy.retry_long = 4; rdev->wiphy.frag_threshold = (u32) -1; rdev->wiphy.rts_threshold = (u32) -1; rdev->wiphy.coverage_class = 0; rdev->wiphy.max_num_csa_counters = 1; rdev->wiphy.max_sched_scan_plans = 1; rdev->wiphy.max_sched_scan_plan_interval = U32_MAX; return &rdev->wiphy; } EXPORT_SYMBOL(wiphy_new_nm); static int wiphy_verify_iface_combinations(struct wiphy *wiphy, const struct ieee80211_iface_combination *iface_comb, int n_iface_comb, bool combined_radio) { const struct ieee80211_iface_combination *c; int i, j; for (i = 0; i < n_iface_comb; i++) { u32 cnt = 0; u16 all_iftypes = 0; c = &iface_comb[i]; /* * Combinations with just one interface aren't real, * however we make an exception for DFS. */ if (WARN_ON((c->max_interfaces < 2) && !c->radar_detect_widths)) return -EINVAL; /* Need at least one channel */ if (WARN_ON(!c->num_different_channels)) return -EINVAL; /* DFS only works on one channel. Avoid this check * for multi-radio global combination, since it hold * the capabilities of all radio combinations. 
*/ if (!combined_radio && WARN_ON(c->radar_detect_widths && c->num_different_channels > 1)) return -EINVAL; if (WARN_ON(!c->n_limits)) return -EINVAL; for (j = 0; j < c->n_limits; j++) { u16 types = c->limits[j].types; /* interface types shouldn't overlap */ if (WARN_ON(types & all_iftypes)) return -EINVAL; all_iftypes |= types; if (WARN_ON(!c->limits[j].max)) return -EINVAL; /* Shouldn't list software iftypes in combinations! */ if (WARN_ON(wiphy->software_iftypes & types)) return -EINVAL; /* Only a single P2P_DEVICE can be allowed, avoid this * check for multi-radio global combination, since it * hold the capabilities of all radio combinations. */ if (!combined_radio && WARN_ON(types & BIT(NL80211_IFTYPE_P2P_DEVICE) && c->limits[j].max > 1)) return -EINVAL; /* Only a single NAN can be allowed, avoid this * check for multi-radio global combination, since it * hold the capabilities of all radio combinations. */ if (!combined_radio && WARN_ON(types & BIT(NL80211_IFTYPE_NAN) && c->limits[j].max > 1)) return -EINVAL; /* * This isn't well-defined right now. If you have an * IBSS interface, then its beacon interval may change * by joining other networks, and nothing prevents it * from doing that. * So technically we probably shouldn't even allow AP * and IBSS in the same interface, but it seems that * some drivers support that, possibly only with fixed * beacon intervals for IBSS. */ if (WARN_ON(types & BIT(NL80211_IFTYPE_ADHOC) && c->beacon_int_min_gcd)) { return -EINVAL; } cnt += c->limits[j].max; /* * Don't advertise an unsupported type * in a combination. */ if (WARN_ON((wiphy->interface_modes & types) != types)) return -EINVAL; } if (WARN_ON(all_iftypes & BIT(NL80211_IFTYPE_WDS))) return -EINVAL; /* You can't even choose that many! */ if (WARN_ON(cnt < c->max_interfaces)) return -EINVAL; } return 0; } static int wiphy_verify_combinations(struct wiphy *wiphy) { int i, ret; bool combined_radio = false; if (wiphy->n_radio) { for (i = 0; i < wiphy->n_radio; i++) { const struct wiphy_radio *radio = &wiphy->radio[i]; ret = wiphy_verify_iface_combinations(wiphy, radio->iface_combinations, radio->n_iface_combinations, false); if (ret) return ret; } combined_radio = true; } ret = wiphy_verify_iface_combinations(wiphy, wiphy->iface_combinations, wiphy->n_iface_combinations, combined_radio); return ret; } int wiphy_register(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); int res; enum nl80211_band band; struct ieee80211_supported_band *sband; bool have_band = false; int i; u16 ifmodes = wiphy->interface_modes; #ifdef CONFIG_PM if (WARN_ON(wiphy->wowlan && (wiphy->wowlan->flags & WIPHY_WOWLAN_GTK_REKEY_FAILURE) && !(wiphy->wowlan->flags & WIPHY_WOWLAN_SUPPORTS_GTK_REKEY))) return -EINVAL; if (WARN_ON(wiphy->wowlan && !wiphy->wowlan->flags && !wiphy->wowlan->n_patterns && !wiphy->wowlan->tcp)) return -EINVAL; #endif if (WARN_ON((wiphy->features & NL80211_FEATURE_TDLS_CHANNEL_SWITCH) && (!rdev->ops->tdls_channel_switch || !rdev->ops->tdls_cancel_channel_switch))) return -EINVAL; if (WARN_ON((wiphy->interface_modes & BIT(NL80211_IFTYPE_NAN)) && (!rdev->ops->start_nan || !rdev->ops->stop_nan || !rdev->ops->add_nan_func || !rdev->ops->del_nan_func || !(wiphy->nan_supported_bands & BIT(NL80211_BAND_2GHZ))))) return -EINVAL; if (WARN_ON(wiphy->interface_modes & BIT(NL80211_IFTYPE_WDS))) return -EINVAL; if (WARN_ON(wiphy->pmsr_capa && !wiphy->pmsr_capa->ftm.supported)) return -EINVAL; if (wiphy->pmsr_capa && wiphy->pmsr_capa->ftm.supported) { if 
(WARN_ON(!wiphy->pmsr_capa->ftm.asap && !wiphy->pmsr_capa->ftm.non_asap)) return -EINVAL; if (WARN_ON(!wiphy->pmsr_capa->ftm.preambles || !wiphy->pmsr_capa->ftm.bandwidths)) return -EINVAL; if (WARN_ON(wiphy->pmsr_capa->ftm.preambles & ~(BIT(NL80211_PREAMBLE_LEGACY) | BIT(NL80211_PREAMBLE_HT) | BIT(NL80211_PREAMBLE_VHT) | BIT(NL80211_PREAMBLE_HE) | BIT(NL80211_PREAMBLE_DMG)))) return -EINVAL; if (WARN_ON((wiphy->pmsr_capa->ftm.trigger_based || wiphy->pmsr_capa->ftm.non_trigger_based) && !(wiphy->pmsr_capa->ftm.preambles & BIT(NL80211_PREAMBLE_HE)))) return -EINVAL; if (WARN_ON(wiphy->pmsr_capa->ftm.bandwidths & ~(BIT(NL80211_CHAN_WIDTH_20_NOHT) | BIT(NL80211_CHAN_WIDTH_20) | BIT(NL80211_CHAN_WIDTH_40) | BIT(NL80211_CHAN_WIDTH_80) | BIT(NL80211_CHAN_WIDTH_80P80) | BIT(NL80211_CHAN_WIDTH_160) | BIT(NL80211_CHAN_WIDTH_5) | BIT(NL80211_CHAN_WIDTH_10)))) return -EINVAL; } if (WARN_ON((wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) && (wiphy->regulatory_flags & (REGULATORY_CUSTOM_REG | REGULATORY_STRICT_REG | REGULATORY_COUNTRY_IE_FOLLOW_POWER | REGULATORY_COUNTRY_IE_IGNORE)))) return -EINVAL; if (WARN_ON(wiphy->coalesce && (!wiphy->coalesce->n_rules || !wiphy->coalesce->n_patterns) && (!wiphy->coalesce->pattern_min_len || wiphy->coalesce->pattern_min_len > wiphy->coalesce->pattern_max_len))) return -EINVAL; if (WARN_ON(wiphy->ap_sme_capa && !(wiphy->flags & WIPHY_FLAG_HAVE_AP_SME))) return -EINVAL; if (WARN_ON(wiphy->addresses && !wiphy->n_addresses)) return -EINVAL; if (WARN_ON(wiphy->addresses && !is_zero_ether_addr(wiphy->perm_addr) && memcmp(wiphy->perm_addr, wiphy->addresses[0].addr, ETH_ALEN))) return -EINVAL; if (WARN_ON(wiphy->max_acl_mac_addrs && (!(wiphy->flags & WIPHY_FLAG_HAVE_AP_SME) || !rdev->ops->set_mac_acl))) return -EINVAL; /* assure only valid behaviours are flagged by driver * hence subtract 2 as bit 0 is invalid. */ if (WARN_ON(wiphy->bss_select_support && (wiphy->bss_select_support & ~(BIT(__NL80211_BSS_SELECT_ATTR_AFTER_LAST) - 2)))) return -EINVAL; if (WARN_ON(wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X) && (!rdev->ops->set_pmk || !rdev->ops->del_pmk))) return -EINVAL; if (WARN_ON(!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_FW_ROAM) && rdev->ops->update_connect_params)) return -EINVAL; if (wiphy->addresses) memcpy(wiphy->perm_addr, wiphy->addresses[0].addr, ETH_ALEN); /* sanity check ifmodes */ WARN_ON(!ifmodes); ifmodes &= ((1 << NUM_NL80211_IFTYPES) - 1) & ~1; if (WARN_ON(ifmodes != wiphy->interface_modes)) wiphy->interface_modes = ifmodes; res = wiphy_verify_combinations(wiphy); if (res) return res; /* sanity check supported bands/channels */ for (band = 0; band < NUM_NL80211_BANDS; band++) { const struct ieee80211_sband_iftype_data *iftd; u16 types = 0; bool have_he = false; sband = wiphy->bands[band]; if (!sband) continue; sband->band = band; if (WARN_ON(!sband->n_channels)) return -EINVAL; /* * on 60GHz or sub-1Ghz band, there are no legacy rates, so * n_bitrates is 0 */ if (WARN_ON((band != NL80211_BAND_60GHZ && band != NL80211_BAND_S1GHZ) && !sband->n_bitrates)) return -EINVAL; if (WARN_ON(band == NL80211_BAND_6GHZ && (sband->ht_cap.ht_supported || sband->vht_cap.vht_supported))) return -EINVAL; /* * Since cfg80211_disable_40mhz_24ghz is global, we can * modify the sband's ht data even if the driver uses a * global structure for that. 
*/ if (cfg80211_disable_40mhz_24ghz && band == NL80211_BAND_2GHZ && sband->ht_cap.ht_supported) { sband->ht_cap.cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40; sband->ht_cap.cap &= ~IEEE80211_HT_CAP_SGI_40; } /* * Since we use a u32 for rate bitmaps in * ieee80211_get_response_rate, we cannot * have more than 32 legacy rates. */ if (WARN_ON(sband->n_bitrates > 32)) return -EINVAL; for (i = 0; i < sband->n_channels; i++) { sband->channels[i].orig_flags = sband->channels[i].flags; sband->channels[i].orig_mag = INT_MAX; sband->channels[i].orig_mpwr = sband->channels[i].max_power; sband->channels[i].band = band; if (WARN_ON(sband->channels[i].freq_offset >= 1000)) return -EINVAL; } for_each_sband_iftype_data(sband, i, iftd) { bool has_ap, has_non_ap; u32 ap_bits = BIT(NL80211_IFTYPE_AP) | BIT(NL80211_IFTYPE_P2P_GO); if (WARN_ON(!iftd->types_mask)) return -EINVAL; if (WARN_ON(types & iftd->types_mask)) return -EINVAL; /* at least one piece of information must be present */ if (WARN_ON(!iftd->he_cap.has_he)) return -EINVAL; types |= iftd->types_mask; if (i == 0) have_he = iftd->he_cap.has_he; else have_he = have_he && iftd->he_cap.has_he; has_ap = iftd->types_mask & ap_bits; has_non_ap = iftd->types_mask & ~ap_bits; /* * For EHT 20 MHz STA, the capabilities format differs * but to simplify, don't check 20 MHz but rather check * only if AP and non-AP were mentioned at the same time, * reject if so. */ if (WARN_ON(iftd->eht_cap.has_eht && has_ap && has_non_ap)) return -EINVAL; } if (WARN_ON(!have_he && band == NL80211_BAND_6GHZ)) return -EINVAL; have_band = true; } if (!have_band) { WARN_ON(1); return -EINVAL; } for (i = 0; i < rdev->wiphy.n_vendor_commands; i++) { /* * Validate we have a policy (can be explicitly set to * VENDOR_CMD_RAW_DATA which is non-NULL) and also that * we have at least one of doit/dumpit. */ if (WARN_ON(!rdev->wiphy.vendor_commands[i].policy)) return -EINVAL; if (WARN_ON(!rdev->wiphy.vendor_commands[i].doit && !rdev->wiphy.vendor_commands[i].dumpit)) return -EINVAL; } #ifdef CONFIG_PM if (WARN_ON(rdev->wiphy.wowlan && rdev->wiphy.wowlan->n_patterns && (!rdev->wiphy.wowlan->pattern_min_len || rdev->wiphy.wowlan->pattern_min_len > rdev->wiphy.wowlan->pattern_max_len))) return -EINVAL; #endif if (!wiphy->max_num_akm_suites) wiphy->max_num_akm_suites = NL80211_MAX_NR_AKM_SUITES; else if (wiphy->max_num_akm_suites < NL80211_MAX_NR_AKM_SUITES || wiphy->max_num_akm_suites > CFG80211_MAX_NUM_AKM_SUITES) return -EINVAL; /* check and set up bitrates */ ieee80211_set_bitrate_flags(wiphy); rdev->wiphy.features |= NL80211_FEATURE_SCAN_FLUSH; rtnl_lock(); wiphy_lock(&rdev->wiphy); res = device_add(&rdev->wiphy.dev); if (res) { wiphy_unlock(&rdev->wiphy); rtnl_unlock(); return res; } list_add_rcu(&rdev->list, &cfg80211_rdev_list); cfg80211_rdev_list_generation++; /* add to debugfs */ rdev->wiphy.debugfsdir = debugfs_create_dir(wiphy_name(&rdev->wiphy), ieee80211_debugfs_dir); cfg80211_debugfs_rdev_add(rdev); nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY); wiphy_unlock(&rdev->wiphy); /* set up regulatory info */ wiphy_regulatory_register(wiphy); if (wiphy->regulatory_flags & REGULATORY_CUSTOM_REG) { struct regulatory_request request; request.wiphy_idx = get_wiphy_idx(wiphy); request.initiator = NL80211_REGDOM_SET_BY_DRIVER; request.alpha2[0] = '9'; request.alpha2[1] = '9'; nl80211_send_reg_change_event(&request); } /* Check that nobody globally advertises any capabilities they do not * advertise on all possible interface types. 
*/ if (wiphy->extended_capabilities_len && wiphy->num_iftype_ext_capab && wiphy->iftype_ext_capab) { u8 supported_on_all, j; const struct wiphy_iftype_ext_capab *capab; capab = wiphy->iftype_ext_capab; for (j = 0; j < wiphy->extended_capabilities_len; j++) { if (capab[0].extended_capabilities_len > j) supported_on_all = capab[0].extended_capabilities[j]; else supported_on_all = 0x00; for (i = 1; i < wiphy->num_iftype_ext_capab; i++) { if (j >= capab[i].extended_capabilities_len) { supported_on_all = 0x00; break; } supported_on_all &= capab[i].extended_capabilities[j]; } if (WARN_ON(wiphy->extended_capabilities[j] & ~supported_on_all)) break; } } rdev->wiphy.registered = true; rtnl_unlock(); res = rfkill_register(rdev->wiphy.rfkill); if (res) { rfkill_destroy(rdev->wiphy.rfkill); rdev->wiphy.rfkill = NULL; wiphy_unregister(&rdev->wiphy); return res; } return 0; } EXPORT_SYMBOL(wiphy_register); void wiphy_rfkill_start_polling(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); if (!rdev->ops->rfkill_poll) return; rdev->rfkill_ops.poll = cfg80211_rfkill_poll; rfkill_resume_polling(wiphy->rfkill); } EXPORT_SYMBOL(wiphy_rfkill_start_polling); void cfg80211_process_wiphy_works(struct cfg80211_registered_device *rdev, struct wiphy_work *end) { unsigned int runaway_limit = 100; unsigned long flags; lockdep_assert_held(&rdev->wiphy.mtx); spin_lock_irqsave(&rdev->wiphy_work_lock, flags); while (!list_empty(&rdev->wiphy_work_list)) { struct wiphy_work *wk; wk = list_first_entry(&rdev->wiphy_work_list, struct wiphy_work, entry); list_del_init(&wk->entry); spin_unlock_irqrestore(&rdev->wiphy_work_lock, flags); trace_wiphy_work_run(&rdev->wiphy, wk); wk->func(&rdev->wiphy, wk); spin_lock_irqsave(&rdev->wiphy_work_lock, flags); if (wk == end) break; if (WARN_ON(--runaway_limit == 0)) INIT_LIST_HEAD(&rdev->wiphy_work_list); } spin_unlock_irqrestore(&rdev->wiphy_work_lock, flags); } void wiphy_unregister(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); wait_event(rdev->dev_wait, ({ int __count; wiphy_lock(&rdev->wiphy); __count = rdev->opencount; wiphy_unlock(&rdev->wiphy); __count == 0; })); if (rdev->wiphy.rfkill) rfkill_unregister(rdev->wiphy.rfkill); rtnl_lock(); wiphy_lock(&rdev->wiphy); nl80211_notify_wiphy(rdev, NL80211_CMD_DEL_WIPHY); rdev->wiphy.registered = false; WARN_ON(!list_empty(&rdev->wiphy.wdev_list)); /* * First remove the hardware from everywhere, this makes * it impossible to find from userspace. 
*/ debugfs_remove_recursive(rdev->wiphy.debugfsdir); list_del_rcu(&rdev->list); synchronize_rcu(); /* * If this device got a regulatory hint tell core its * free to listen now to a new shiny device regulatory hint */ wiphy_regulatory_deregister(wiphy); cfg80211_rdev_list_generation++; device_del(&rdev->wiphy.dev); #ifdef CONFIG_PM if (rdev->wiphy.wowlan_config && rdev->ops->set_wakeup) rdev_set_wakeup(rdev, false); #endif /* surely nothing is reachable now, clean up work */ cfg80211_process_wiphy_works(rdev, NULL); wiphy_unlock(&rdev->wiphy); rtnl_unlock(); /* this has nothing to do now but make sure it's gone */ cancel_work_sync(&rdev->wiphy_work); cancel_work_sync(&rdev->conn_work); flush_work(&rdev->event_work); cancel_delayed_work_sync(&rdev->dfs_update_channels_wk); cancel_delayed_work_sync(&rdev->background_cac_done_wk); flush_work(&rdev->destroy_work); flush_work(&rdev->propagate_radar_detect_wk); flush_work(&rdev->propagate_cac_done_wk); flush_work(&rdev->mgmt_registrations_update_wk); flush_work(&rdev->background_cac_abort_wk); cfg80211_rdev_free_wowlan(rdev); cfg80211_free_coalesce(rdev->coalesce); rdev->coalesce = NULL; } EXPORT_SYMBOL(wiphy_unregister); void cfg80211_dev_free(struct cfg80211_registered_device *rdev) { struct cfg80211_internal_bss *scan, *tmp; struct cfg80211_beacon_registration *reg, *treg; rfkill_destroy(rdev->wiphy.rfkill); list_for_each_entry_safe(reg, treg, &rdev->beacon_registrations, list) { list_del(&reg->list); kfree(reg); } list_for_each_entry_safe(scan, tmp, &rdev->bss_list, list) cfg80211_put_bss(&rdev->wiphy, &scan->pub); mutex_destroy(&rdev->wiphy.mtx); /* * The 'regd' can only be non-NULL if we never finished * initializing the wiphy and thus never went through the * unregister path - e.g. in failure scenarios. Thus, it * cannot have been visible to anyone if non-NULL, so we * can just free it here. 
*/ kfree(rcu_dereference_raw(rdev->wiphy.regd)); kfree(rdev); } void wiphy_free(struct wiphy *wiphy) { put_device(&wiphy->dev); } EXPORT_SYMBOL(wiphy_free); void wiphy_rfkill_set_hw_state_reason(struct wiphy *wiphy, bool blocked, enum rfkill_hard_block_reasons reason) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); if (rfkill_set_hw_state_reason(wiphy->rfkill, blocked, reason)) schedule_work(&rdev->rfkill_block); } EXPORT_SYMBOL(wiphy_rfkill_set_hw_state_reason); static void _cfg80211_unregister_wdev(struct wireless_dev *wdev, bool unregister_netdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_cqm_config *cqm_config; unsigned int link_id; ASSERT_RTNL(); lockdep_assert_held(&rdev->wiphy.mtx); nl80211_notify_iface(rdev, wdev, NL80211_CMD_DEL_INTERFACE); wdev->registered = false; if (wdev->netdev) { sysfs_remove_link(&wdev->netdev->dev.kobj, "phy80211"); if (unregister_netdev) unregister_netdevice(wdev->netdev); } list_del_rcu(&wdev->list); synchronize_net(); rdev->devlist_generation++; cfg80211_mlme_purge_registrations(wdev); switch (wdev->iftype) { case NL80211_IFTYPE_P2P_DEVICE: cfg80211_stop_p2p_device(rdev, wdev); break; case NL80211_IFTYPE_NAN: cfg80211_stop_nan(rdev, wdev); break; default: break; } #ifdef CONFIG_CFG80211_WEXT kfree_sensitive(wdev->wext.keys); wdev->wext.keys = NULL; #endif wiphy_work_cancel(wdev->wiphy, &wdev->cqm_rssi_work); /* deleted from the list, so can't be found from nl80211 any more */ cqm_config = rcu_access_pointer(wdev->cqm_config); kfree_rcu(cqm_config, rcu_head); RCU_INIT_POINTER(wdev->cqm_config, NULL); /* * Ensure that all events have been processed and * freed. */ cfg80211_process_wdev_events(wdev); if (wdev->iftype == NL80211_IFTYPE_STATION || wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) { for (link_id = 0; link_id < ARRAY_SIZE(wdev->links); link_id++) { struct cfg80211_internal_bss *curbss; curbss = wdev->links[link_id].client.current_bss; if (WARN_ON(curbss)) { cfg80211_unhold_bss(curbss); cfg80211_put_bss(wdev->wiphy, &curbss->pub); wdev->links[link_id].client.current_bss = NULL; } } } wdev->connected = false; } void cfg80211_unregister_wdev(struct wireless_dev *wdev) { _cfg80211_unregister_wdev(wdev, true); } EXPORT_SYMBOL(cfg80211_unregister_wdev); static const struct device_type wiphy_type = { .name = "wlan", }; void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev, enum nl80211_iftype iftype, int num) { lockdep_assert_held(&rdev->wiphy.mtx); rdev->num_running_ifaces += num; if (iftype == NL80211_IFTYPE_MONITOR) rdev->num_running_monitor_ifaces += num; } void cfg80211_leave(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { struct net_device *dev = wdev->netdev; struct cfg80211_sched_scan_request *pos, *tmp; lockdep_assert_held(&rdev->wiphy.mtx); cfg80211_pmsr_wdev_down(wdev); cfg80211_stop_background_radar_detection(wdev); switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: cfg80211_leave_ibss(rdev, dev, true); break; case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_STATION: list_for_each_entry_safe(pos, tmp, &rdev->sched_scan_req_list, list) { if (dev == pos->dev) cfg80211_stop_sched_scan_req(rdev, pos, false); } #ifdef CONFIG_CFG80211_WEXT kfree(wdev->wext.ie); wdev->wext.ie = NULL; wdev->wext.ie_len = 0; wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC; #endif cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, true); break; case NL80211_IFTYPE_MESH_POINT: cfg80211_leave_mesh(rdev, dev); break; case NL80211_IFTYPE_AP: case 
NL80211_IFTYPE_P2P_GO: cfg80211_stop_ap(rdev, dev, -1, true); break; case NL80211_IFTYPE_OCB: cfg80211_leave_ocb(rdev, dev); break; case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_NAN: /* cannot happen, has no netdev */ break; case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MONITOR: /* nothing to do */ break; case NL80211_IFTYPE_UNSPECIFIED: case NL80211_IFTYPE_WDS: case NUM_NL80211_IFTYPES: /* invalid */ break; } } void cfg80211_stop_iface(struct wiphy *wiphy, struct wireless_dev *wdev, gfp_t gfp) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_event *ev; unsigned long flags; trace_cfg80211_stop_iface(wiphy, wdev); ev = kzalloc(sizeof(*ev), gfp); if (!ev) return; ev->type = EVENT_STOPPED; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); } EXPORT_SYMBOL(cfg80211_stop_iface); void cfg80211_init_wdev(struct wireless_dev *wdev) { INIT_LIST_HEAD(&wdev->event_list); spin_lock_init(&wdev->event_lock); INIT_LIST_HEAD(&wdev->mgmt_registrations); INIT_LIST_HEAD(&wdev->pmsr_list); spin_lock_init(&wdev->pmsr_lock); INIT_WORK(&wdev->pmsr_free_wk, cfg80211_pmsr_free_wk); #ifdef CONFIG_CFG80211_WEXT wdev->wext.default_key = -1; wdev->wext.default_mgmt_key = -1; wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC; #endif wiphy_work_init(&wdev->cqm_rssi_work, cfg80211_cqm_rssi_notify_work); if (wdev->wiphy->flags & WIPHY_FLAG_PS_ON_BY_DEFAULT) wdev->ps = true; else wdev->ps = false; /* allow mac80211 to determine the timeout */ wdev->ps_timeout = -1; wdev->radio_mask = BIT(wdev->wiphy->n_radio) - 1; if ((wdev->iftype == NL80211_IFTYPE_STATION || wdev->iftype == NL80211_IFTYPE_P2P_CLIENT || wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr) wdev->netdev->priv_flags |= IFF_DONT_BRIDGE; INIT_WORK(&wdev->disconnect_wk, cfg80211_autodisconnect_wk); } void cfg80211_register_wdev(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { ASSERT_RTNL(); lockdep_assert_held(&rdev->wiphy.mtx); /* * We get here also when the interface changes network namespaces, * as it's registered into the new one, but we don't want it to * change ID in that case. Checking if the ID is already assigned * works, because 0 isn't considered a valid ID and the memory is * 0-initialized. 
*/ if (!wdev->identifier) wdev->identifier = ++rdev->wdev_id; list_add_rcu(&wdev->list, &rdev->wiphy.wdev_list); rdev->devlist_generation++; wdev->registered = true; if (wdev->netdev && sysfs_create_link(&wdev->netdev->dev.kobj, &rdev->wiphy.dev.kobj, "phy80211")) pr_err("failed to add phy80211 symlink to netdev!\n"); nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE); } int cfg80211_register_netdevice(struct net_device *dev) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev; int ret; ASSERT_RTNL(); if (WARN_ON(!wdev)) return -EINVAL; rdev = wiphy_to_rdev(wdev->wiphy); lockdep_assert_held(&rdev->wiphy.mtx); /* we'll take care of this */ wdev->registered = true; wdev->registering = true; ret = register_netdevice(dev); if (ret) goto out; cfg80211_register_wdev(rdev, wdev); ret = 0; out: wdev->registering = false; if (ret) wdev->registered = false; return ret; } EXPORT_SYMBOL(cfg80211_register_netdevice); static int cfg80211_netdev_notifier_call(struct notifier_block *nb, unsigned long state, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev; struct cfg80211_sched_scan_request *pos, *tmp; if (!wdev) return NOTIFY_DONE; rdev = wiphy_to_rdev(wdev->wiphy); WARN_ON(wdev->iftype == NL80211_IFTYPE_UNSPECIFIED); switch (state) { case NETDEV_POST_INIT: SET_NETDEV_DEVTYPE(dev, &wiphy_type); wdev->netdev = dev; /* can only change netns with wiphy */ dev->netns_local = true; cfg80211_init_wdev(wdev); break; case NETDEV_REGISTER: if (!wdev->registered) { wiphy_lock(&rdev->wiphy); cfg80211_register_wdev(rdev, wdev); wiphy_unlock(&rdev->wiphy); } break; case NETDEV_UNREGISTER: /* * It is possible to get NETDEV_UNREGISTER multiple times, * so check wdev->registered. */ if (wdev->registered && !wdev->registering) { wiphy_lock(&rdev->wiphy); _cfg80211_unregister_wdev(wdev, false); wiphy_unlock(&rdev->wiphy); } break; case NETDEV_GOING_DOWN: wiphy_lock(&rdev->wiphy); cfg80211_leave(rdev, wdev); cfg80211_remove_links(wdev); wiphy_unlock(&rdev->wiphy); /* since we just did cfg80211_leave() nothing to do there */ cancel_work_sync(&wdev->disconnect_wk); cancel_work_sync(&wdev->pmsr_free_wk); break; case NETDEV_DOWN: wiphy_lock(&rdev->wiphy); cfg80211_update_iface_num(rdev, wdev->iftype, -1); if (rdev->scan_req && rdev->scan_req->wdev == wdev) { if (WARN_ON(!rdev->scan_req->notified && (!rdev->int_scan_req || !rdev->int_scan_req->notified))) rdev->scan_req->info.aborted = true; ___cfg80211_scan_done(rdev, false); } list_for_each_entry_safe(pos, tmp, &rdev->sched_scan_req_list, list) { if (WARN_ON(pos->dev == wdev->netdev)) cfg80211_stop_sched_scan_req(rdev, pos, false); } rdev->opencount--; wiphy_unlock(&rdev->wiphy); wake_up(&rdev->dev_wait); break; case NETDEV_UP: wiphy_lock(&rdev->wiphy); cfg80211_update_iface_num(rdev, wdev->iftype, 1); switch (wdev->iftype) { #ifdef CONFIG_CFG80211_WEXT case NL80211_IFTYPE_ADHOC: cfg80211_ibss_wext_join(rdev, wdev); break; case NL80211_IFTYPE_STATION: cfg80211_mgd_wext_connect(rdev, wdev); break; #endif #ifdef CONFIG_MAC80211_MESH case NL80211_IFTYPE_MESH_POINT: { /* backward compat code... 
*/ struct mesh_setup setup; memcpy(&setup, &default_mesh_setup, sizeof(setup)); /* back compat only needed for mesh_id */ setup.mesh_id = wdev->u.mesh.id; setup.mesh_id_len = wdev->u.mesh.id_up_len; if (wdev->u.mesh.id_up_len) __cfg80211_join_mesh(rdev, dev, &setup, &default_mesh_config); break; } #endif default: break; } rdev->opencount++; /* * Configure power management to the driver here so that its * correctly set also after interface type changes etc. */ if ((wdev->iftype == NL80211_IFTYPE_STATION || wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) && rdev->ops->set_power_mgmt && rdev_set_power_mgmt(rdev, dev, wdev->ps, wdev->ps_timeout)) { /* assume this means it's off */ wdev->ps = false; } wiphy_unlock(&rdev->wiphy); break; case NETDEV_PRE_UP: if (!cfg80211_iftype_allowed(wdev->wiphy, wdev->iftype, wdev->use_4addr, 0)) return notifier_from_errno(-EOPNOTSUPP); if (rfkill_blocked(rdev->wiphy.rfkill)) return notifier_from_errno(-ERFKILL); break; default: return NOTIFY_DONE; } wireless_nlevent_flush(); return NOTIFY_OK; } static struct notifier_block cfg80211_netdev_notifier = { .notifier_call = cfg80211_netdev_notifier_call, }; static void __net_exit cfg80211_pernet_exit(struct net *net) { struct cfg80211_registered_device *rdev; rtnl_lock(); for_each_rdev(rdev) { if (net_eq(wiphy_net(&rdev->wiphy), net)) WARN_ON(cfg80211_switch_netns(rdev, &init_net)); } rtnl_unlock(); } static struct pernet_operations cfg80211_pernet_ops = { .exit = cfg80211_pernet_exit, }; void wiphy_work_queue(struct wiphy *wiphy, struct wiphy_work *work) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); unsigned long flags; trace_wiphy_work_queue(wiphy, work); spin_lock_irqsave(&rdev->wiphy_work_lock, flags); if (list_empty(&work->entry)) list_add_tail(&work->entry, &rdev->wiphy_work_list); spin_unlock_irqrestore(&rdev->wiphy_work_lock, flags); queue_work(system_unbound_wq, &rdev->wiphy_work); } EXPORT_SYMBOL_GPL(wiphy_work_queue); void wiphy_work_cancel(struct wiphy *wiphy, struct wiphy_work *work) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); unsigned long flags; lockdep_assert_held(&wiphy->mtx); trace_wiphy_work_cancel(wiphy, work); spin_lock_irqsave(&rdev->wiphy_work_lock, flags); if (!list_empty(&work->entry)) list_del_init(&work->entry); spin_unlock_irqrestore(&rdev->wiphy_work_lock, flags); } EXPORT_SYMBOL_GPL(wiphy_work_cancel); void wiphy_work_flush(struct wiphy *wiphy, struct wiphy_work *work) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); unsigned long flags; bool run; trace_wiphy_work_flush(wiphy, work); spin_lock_irqsave(&rdev->wiphy_work_lock, flags); run = !work || !list_empty(&work->entry); spin_unlock_irqrestore(&rdev->wiphy_work_lock, flags); if (run) cfg80211_process_wiphy_works(rdev, work); } EXPORT_SYMBOL_GPL(wiphy_work_flush); void wiphy_delayed_work_timer(struct timer_list *t) { struct wiphy_delayed_work *dwork = from_timer(dwork, t, timer); wiphy_work_queue(dwork->wiphy, &dwork->work); } EXPORT_SYMBOL(wiphy_delayed_work_timer); void wiphy_delayed_work_queue(struct wiphy *wiphy, struct wiphy_delayed_work *dwork, unsigned long delay) { trace_wiphy_delayed_work_queue(wiphy, &dwork->work, delay); if (!delay) { del_timer(&dwork->timer); wiphy_work_queue(wiphy, &dwork->work); return; } dwork->wiphy = wiphy; mod_timer(&dwork->timer, jiffies + delay); } EXPORT_SYMBOL_GPL(wiphy_delayed_work_queue); void wiphy_delayed_work_cancel(struct wiphy *wiphy, struct wiphy_delayed_work *dwork) { lockdep_assert_held(&wiphy->mtx); del_timer_sync(&dwork->timer); 
wiphy_work_cancel(wiphy, &dwork->work); } EXPORT_SYMBOL_GPL(wiphy_delayed_work_cancel); void wiphy_delayed_work_flush(struct wiphy *wiphy, struct wiphy_delayed_work *dwork) { lockdep_assert_held(&wiphy->mtx); del_timer_sync(&dwork->timer); wiphy_work_flush(wiphy, &dwork->work); } EXPORT_SYMBOL_GPL(wiphy_delayed_work_flush); bool wiphy_delayed_work_pending(struct wiphy *wiphy, struct wiphy_delayed_work *dwork) { return timer_pending(&dwork->timer); } EXPORT_SYMBOL_GPL(wiphy_delayed_work_pending); static int __init cfg80211_init(void) { int err; err = register_pernet_device(&cfg80211_pernet_ops); if (err) goto out_fail_pernet; err = wiphy_sysfs_init(); if (err) goto out_fail_sysfs; err = register_netdevice_notifier(&cfg80211_netdev_notifier); if (err) goto out_fail_notifier; err = nl80211_init(); if (err) goto out_fail_nl80211; ieee80211_debugfs_dir = debugfs_create_dir("ieee80211", NULL); err = regulatory_init(); if (err) goto out_fail_reg; cfg80211_wq = alloc_ordered_workqueue("cfg80211", WQ_MEM_RECLAIM); if (!cfg80211_wq) { err = -ENOMEM; goto out_fail_wq; } return 0; out_fail_wq: regulatory_exit(); out_fail_reg: debugfs_remove(ieee80211_debugfs_dir); nl80211_exit(); out_fail_nl80211: unregister_netdevice_notifier(&cfg80211_netdev_notifier); out_fail_notifier: wiphy_sysfs_exit(); out_fail_sysfs: unregister_pernet_device(&cfg80211_pernet_ops); out_fail_pernet: return err; } fs_initcall(cfg80211_init); static void __exit cfg80211_exit(void) { debugfs_remove(ieee80211_debugfs_dir); nl80211_exit(); unregister_netdevice_notifier(&cfg80211_netdev_notifier); wiphy_sysfs_exit(); regulatory_exit(); unregister_pernet_device(&cfg80211_pernet_ops); destroy_workqueue(cfg80211_wq); } module_exit(cfg80211_exit);
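The wiphy_work helpers above implement a small, wiphy-bound work queue: queueing is idempotent and safe from atomic context, while cancel and flush require the wiphy mutex. The sketch below shows how a driver might use them; it is a hedged illustration only — struct foo_priv and the foo_*() functions are hypothetical, and only the helpers defined above plus wiphy_lock()/wiphy_unlock() from cfg80211.h are assumed.

#include <net/cfg80211.h>

/* Hypothetical driver private data embedding a wiphy_work (illustration only). */
struct foo_priv {
	struct wiphy *wiphy;
	struct wiphy_work event_work;
};

/* Runs from the cfg80211 wiphy worker, which holds wiphy->mtx, so the
 * callback may sleep and touch wdev state directly. */
static void foo_handle_event(struct wiphy *wiphy, struct wiphy_work *work)
{
	struct foo_priv *priv = container_of(work, struct foo_priv, event_work);

	(void)priv;	/* process the deferred event here */
}

static void foo_init(struct foo_priv *priv)
{
	wiphy_work_init(&priv->event_work, foo_handle_event);
}

/* Safe from atomic context: wiphy_work_queue() only links the entry under a
 * spinlock, and queueing an already-pending work is a no-op (the list_empty()
 * check in the code above). */
static void foo_irq(struct foo_priv *priv)
{
	wiphy_work_queue(priv->wiphy, &priv->event_work);
}

/* Cancellation must hold wiphy->mtx, matching the lockdep assertion in
 * wiphy_work_cancel() above. */
static void foo_stop(struct foo_priv *priv)
{
	wiphy_lock(priv->wiphy);
	wiphy_work_cancel(priv->wiphy, &priv->event_work);
	wiphy_unlock(priv->wiphy);
}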
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_TLB_H
#define _ASM_X86_TLB_H

#define tlb_flush tlb_flush
static inline void tlb_flush(struct mmu_gather *tlb);

#include <asm-generic/tlb.h>

static inline void tlb_flush(struct mmu_gather *tlb)
{
	unsigned long start = 0UL, end = TLB_FLUSH_ALL;
	unsigned int stride_shift = tlb_get_unmap_shift(tlb);

	if (!tlb->fullmm && !tlb->need_flush_all) {
		start = tlb->start;
		end = tlb->end;
	}

	flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
}

/*
 * While x86 architecture in general requires an IPI to perform TLB
 * shootdown, enablement code for several hypervisors overrides
 * .flush_tlb_others hook in pv_mmu_ops and implements it by issuing
 * a hypercall. To keep software pagetable walkers safe in this case we
 * switch to RCU based table free (MMU_GATHER_RCU_TABLE_FREE). See the comment
 * below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h
 * for more details.
 */
static inline void __tlb_remove_table(void *table)
{
	free_page_and_swap_cache(table);
}

static inline void invlpg(unsigned long addr)
{
	asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
}

#endif /* _ASM_X86_TLB_H */
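tlb_flush() above only chooses the flush range and the unmap stride; the standalone sketch below (plain user-space C, not kernel code) illustrates why the stride matters: flushing the same range at a larger stride needs far fewer per-address invalidations, which is what flush_tlb_mm_range() weighs against simply doing a full TLB flush (its exact threshold is a separate tunable not shown here).

#include <stdio.h>

/* Number of per-address invalidations a ranged flush would need if it walks
 * the range at the given stride (4 KiB pages -> shift 12, 2 MiB -> 21). */
static unsigned long invalidations(unsigned long start, unsigned long end,
				   unsigned int stride_shift)
{
	return (end - start) >> stride_shift;
}

int main(void)
{
	unsigned long start = 0, end = 1UL << 21;	/* a 2 MiB range */

	printf("4 KiB stride: %lu invlpg-style flushes\n",
	       invalidations(start, end, 12));		/* 512 */
	printf("2 MiB stride: %lu invlpg-style flush\n",
	       invalidations(start, end, 21));		/* 1 */
	return 0;
}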
/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright 2011-2014 Autronica Fire and Security AS * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * include file for HSR and PRP. */ #ifndef __HSR_PRIVATE_H #define __HSR_PRIVATE_H #include <linux/netdevice.h> #include <linux/list.h> #include <linux/if_vlan.h> #include <linux/if_hsr.h> /* Time constants as specified in the HSR specification (IEC-62439-3 2010) * Table 8. * All values in milliseconds. */ #define HSR_LIFE_CHECK_INTERVAL 2000 /* ms */ #define HSR_NODE_FORGET_TIME 60000 /* ms */ #define HSR_PROXY_NODE_FORGET_TIME 60000 /* ms */ #define HSR_ANNOUNCE_INTERVAL 100 /* ms */ #define HSR_ENTRY_FORGET_TIME 400 /* ms */ /* By how much may slave1 and slave2 timestamps of latest received frame from * each node differ before we notify of a communication problem? */ #define MAX_SLAVE_DIFF 3000 /* ms */ #define HSR_SEQNR_START (USHRT_MAX - 1024) #define HSR_SUP_SEQNR_START (HSR_SEQNR_START / 2) /* How often shall we check for broken ring and remove node entries older than * HSR_NODE_FORGET_TIME? */ #define PRUNE_PERIOD 3000 /* ms */ #define PRUNE_PROXY_PERIOD 3000 /* ms */ #define HSR_TLV_EOT 0 /* End of TLVs */ #define HSR_TLV_ANNOUNCE 22 #define HSR_TLV_LIFE_CHECK 23 /* PRP V1 life check for Duplicate discard */ #define PRP_TLV_LIFE_CHECK_DD 20 /* PRP V1 life check for Duplicate Accept */ #define PRP_TLV_LIFE_CHECK_DA 21 /* PRP V1 life redundancy box MAC address */ #define PRP_TLV_REDBOX_MAC 30 #define HSR_V1_SUP_LSDUSIZE 52 /* The helper functions below assume that 'path' occupies the 4 most * significant bits of the 16-bit field shared by 'path' and 'LSDU_size' (or * equivalently, the 4 most significant bits of HSR tag byte 14). * * This is unclear in the IEC specification; its definition of MAC addresses * indicates the spec is written with the least significant bit first (to the * left). This, however, would mean that the LSDU field would be split in two * with the path field in-between, which seems strange. I'm guessing the MAC * address definition is in error.
*/ static inline void set_hsr_tag_path(struct hsr_tag *ht, u16 path) { ht->path_and_LSDU_size = htons((ntohs(ht->path_and_LSDU_size) & 0x0FFF) | (path << 12)); } static inline void set_hsr_tag_LSDU_size(struct hsr_tag *ht, u16 LSDU_size) { ht->path_and_LSDU_size = htons((ntohs(ht->path_and_LSDU_size) & 0xF000) | (LSDU_size & 0x0FFF)); } struct hsr_ethhdr { struct ethhdr ethhdr; struct hsr_tag hsr_tag; } __packed; struct hsr_vlan_ethhdr { struct vlan_ethhdr vlanhdr; struct hsr_tag hsr_tag; } __packed; struct hsr_sup_tlv { u8 HSR_TLV_type; u8 HSR_TLV_length; } __packed; /* HSR/PRP Supervision Frame data types. * Field names as defined in the IEC:2010 standard for HSR. */ struct hsr_sup_tag { __be16 path_and_HSR_ver; __be16 sequence_nr; struct hsr_sup_tlv tlv; } __packed; struct hsr_sup_payload { unsigned char macaddress_A[ETH_ALEN]; } __packed; static inline void set_hsr_stag_path(struct hsr_sup_tag *hst, u16 path) { set_hsr_tag_path((struct hsr_tag *)hst, path); } static inline void set_hsr_stag_HSR_ver(struct hsr_sup_tag *hst, u16 HSR_ver) { set_hsr_tag_LSDU_size((struct hsr_tag *)hst, HSR_ver); } struct hsrv0_ethhdr_sp { struct ethhdr ethhdr; struct hsr_sup_tag hsr_sup; } __packed; struct hsrv1_ethhdr_sp { struct ethhdr ethhdr; struct hsr_tag hsr; struct hsr_sup_tag hsr_sup; } __packed; enum hsr_port_type { HSR_PT_NONE = 0, /* Must be 0, used by framereg */ HSR_PT_SLAVE_A, HSR_PT_SLAVE_B, HSR_PT_INTERLINK, HSR_PT_MASTER, HSR_PT_PORTS, /* This must be the last item in the enum */ }; /* PRP Redunancy Control Trailor (RCT). * As defined in IEC-62439-4:2012, the PRP RCT is really { sequence Nr, * Lan indentifier (LanId), LSDU_size and PRP_suffix = 0x88FB }. * * Field names as defined in the IEC:2012 standard for PRP. */ struct prp_rct { __be16 sequence_nr; __be16 lan_id_and_LSDU_size; __be16 PRP_suffix; } __packed; static inline u16 get_prp_LSDU_size(struct prp_rct *rct) { return ntohs(rct->lan_id_and_LSDU_size) & 0x0FFF; } static inline void set_prp_lan_id(struct prp_rct *rct, u16 lan_id) { rct->lan_id_and_LSDU_size = htons((ntohs(rct->lan_id_and_LSDU_size) & 0x0FFF) | (lan_id << 12)); } static inline void set_prp_LSDU_size(struct prp_rct *rct, u16 LSDU_size) { rct->lan_id_and_LSDU_size = htons((ntohs(rct->lan_id_and_LSDU_size) & 0xF000) | (LSDU_size & 0x0FFF)); } struct hsr_port { struct list_head port_list; struct net_device *dev; struct hsr_priv *hsr; enum hsr_port_type type; }; struct hsr_frame_info; struct hsr_node; struct hsr_proto_ops { /* format and send supervision frame */ void (*send_sv_frame)(struct hsr_port *port, unsigned long *interval, const unsigned char addr[ETH_ALEN]); void (*handle_san_frame)(bool san, enum hsr_port_type port, struct hsr_node *node); bool (*drop_frame)(struct hsr_frame_info *frame, struct hsr_port *port); struct sk_buff * (*get_untagged_frame)(struct hsr_frame_info *frame, struct hsr_port *port); struct sk_buff * (*create_tagged_frame)(struct hsr_frame_info *frame, struct hsr_port *port); int (*fill_frame_info)(__be16 proto, struct sk_buff *skb, struct hsr_frame_info *frame); bool (*invalid_dan_ingress_frame)(__be16 protocol); void (*update_san_info)(struct hsr_node *node, bool is_sup); }; struct hsr_self_node { unsigned char macaddress_A[ETH_ALEN]; unsigned char macaddress_B[ETH_ALEN]; struct rcu_head rcu_head; }; struct hsr_priv { struct rcu_head rcu_head; struct list_head ports; struct list_head node_db; /* Known HSR nodes */ struct list_head proxy_node_db; /* RedBox HSR proxy nodes */ struct hsr_self_node __rcu *self_node; /* MACs of slaves */ struct 
timer_list announce_timer; /* Supervision frame dispatch */ struct timer_list announce_proxy_timer; struct timer_list prune_timer; struct timer_list prune_proxy_timer; int announce_count; u16 sequence_nr; u16 sup_sequence_nr; /* For HSRv1 separate seq_nr for supervision */ enum hsr_version prot_version; /* Indicate if HSRv0, HSRv1 or PRPv1 */ spinlock_t seqnr_lock; /* locking for sequence_nr */ spinlock_t list_lock; /* locking for node list */ struct hsr_proto_ops *proto_ops; #define PRP_LAN_ID 0x5 /* 0x1010 for A and 0x1011 for B. Bit 0 is set * based on SLAVE_A or SLAVE_B */ u8 net_id; /* for PRP, it occupies most significant 3 bits * of lan_id */ bool fwd_offloaded; /* Forwarding offloaded to HW */ bool redbox; /* Device supports HSR RedBox */ unsigned char macaddress_redbox[ETH_ALEN]; unsigned char sup_multicast_addr[ETH_ALEN] __aligned(sizeof(u16)); /* Align to u16 boundary to avoid unaligned access * in ether_addr_equal */ #ifdef CONFIG_DEBUG_FS struct dentry *node_tbl_root; #endif }; #define hsr_for_each_port(hsr, port) \ list_for_each_entry_rcu((port), &(hsr)->ports, port_list) struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt); /* Caller must ensure skb is a valid HSR frame */ static inline u16 hsr_get_skb_sequence_nr(struct sk_buff *skb) { struct hsr_ethhdr *hsr_ethhdr; hsr_ethhdr = (struct hsr_ethhdr *)skb_mac_header(skb); return ntohs(hsr_ethhdr->hsr_tag.sequence_nr); } static inline struct prp_rct *skb_get_PRP_rct(struct sk_buff *skb) { unsigned char *tail = skb_tail_pointer(skb) - HSR_HLEN; struct prp_rct *rct = (struct prp_rct *)tail; if (rct->PRP_suffix == htons(ETH_P_PRP)) return rct; return NULL; } /* Assume caller has confirmed this skb is PRP suffixed */ static inline u16 prp_get_skb_sequence_nr(struct prp_rct *rct) { return ntohs(rct->sequence_nr); } /* assume there is a valid rct */ static inline bool prp_check_lsdu_size(struct sk_buff *skb, struct prp_rct *rct, bool is_sup) { struct ethhdr *ethhdr; int expected_lsdu_size; if (is_sup) { expected_lsdu_size = HSR_V1_SUP_LSDUSIZE; } else { ethhdr = (struct ethhdr *)skb_mac_header(skb); expected_lsdu_size = skb->len - 14; if (ethhdr->h_proto == htons(ETH_P_8021Q)) expected_lsdu_size -= 4; } return (expected_lsdu_size == get_prp_LSDU_size(rct)); } #if IS_ENABLED(CONFIG_DEBUG_FS) void hsr_debugfs_rename(struct net_device *dev); void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev); void hsr_debugfs_term(struct hsr_priv *priv); void hsr_debugfs_create_root(void); void hsr_debugfs_remove_root(void); #else static inline void hsr_debugfs_rename(struct net_device *dev) { } static inline void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev) {} static inline void hsr_debugfs_term(struct hsr_priv *priv) {} static inline void hsr_debugfs_create_root(void) {} static inline void hsr_debugfs_remove_root(void) {} #endif #endif /* __HSR_PRIVATE_H */
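set_hsr_tag_path() and set_hsr_tag_LSDU_size() above pack two fields into one 16-bit word: 'path' in the top 4 bits and 'LSDU_size' in the low 12 bits, stored big-endian on the wire. The standalone sketch below models the same masking and shifting in host byte order; the helper names set_path()/set_lsdu_size() are illustrative, not part of the HSR code.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* 'path' occupies the 4 most significant bits of the shared 16-bit field. */
static uint16_t set_path(uint16_t field, uint16_t path)
{
	return (uint16_t)((field & 0x0FFF) | (path << 12));
}

/* 'LSDU_size' occupies the low 12 bits. */
static uint16_t set_lsdu_size(uint16_t field, uint16_t lsdu_size)
{
	return (uint16_t)((field & 0xF000) | (lsdu_size & 0x0FFF));
}

int main(void)
{
	uint16_t field = 0;

	field = set_path(field, 0xA);		/* example path id */
	field = set_lsdu_size(field, 52);	/* HSR_V1_SUP_LSDUSIZE */

	assert(field >> 12 == 0xA);
	assert((field & 0x0FFF) == 52);
	printf("path=%u LSDU_size=%u\n", field >> 12, field & 0x0FFF);
	return 0;
}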
// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Generic socket support routines. Memory allocators, socket lock/release * handler for protocols to use and generic option handler. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Florian La Roche, <flla@stud.uni-sb.de> * Alan Cox, <A.Cox@swansea.ac.uk> * * Fixes: * Alan Cox : Numerous verify_area() problems * Alan Cox : Connecting on a connecting socket * now returns an error for tcp. * Alan Cox : sock->protocol is set correctly. * and is not sometimes left as 0. * Alan Cox : connect handles icmp errors on a * connect properly. Unfortunately there * is a restart syscall nasty there. I * can't match BSD without hacking the C * library. Ideas urgently sought! * Alan Cox : Disallow bind() to addresses that are * not ours - especially broadcast ones!! * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, * instead they leave that for the DESTROY timer. * Alan Cox : Clean up error flag in accept * Alan Cox : TCP ack handling is buggy, the DESTROY timer * was buggy. Put a remove_sock() in the handler * for memory when we hit 0. Also altered the timer * code. The ACK stuff can wait and needs major * TCP layer surgery. * Alan Cox : Fixed TCP ack bug, removed remove sock * and fixed timer/inet_bh race. * Alan Cox : Added zapped flag for TCP * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... * Rick Sladkey : Relaxed UDP rules for matching packets. * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support * Pauline Middelink : identd support * Alan Cox : Fixed connect() taking signals I think. * Alan Cox : SO_LINGER supported * Alan Cox : Error reporting fixes * Anonymous : inet_create tidied up (sk->reuse setting) * Alan Cox : inet sockets don't set sk->type!
* Alan Cox : Split socket option code * Alan Cox : Callbacks * Alan Cox : Nagle flag for Charles & Johannes stuff * Alex : Removed restriction on inet fioctl * Alan Cox : Splitting INET from NET core * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code * Alan Cox : Split IP from generic code * Alan Cox : New kfree_skbmem() * Alan Cox : Make SO_DEBUG superuser only. * Alan Cox : Allow anyone to clear SO_DEBUG * (compatibility fix) * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. * Alan Cox : Allocator for a socket is settable. * Alan Cox : SO_ERROR includes soft errors. * Alan Cox : Allow NULL arguments on some SO_ opts * Alan Cox : Generic socket allocation to make hooks * easier (suggested by Craig Metz). * Michael Pall : SO_ERROR returns positive errno again * Steve Whitehouse: Added default destructor to free * protocol private data. * Steve Whitehouse: Added various other default routines * common to several socket families. * Chris Evans : Call suser() check last on F_SETOWN * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() * Andi Kleen : Fix write_space callback * Chris Evans : Security fixes - signedness again * Arnaldo C. Melo : cleanups, use skb_queue_purge * * To Fix: */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/unaligned.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/errqueue.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/poll.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/init.h> #include <linux/highmem.h> #include <linux/user_namespace.h> #include <linux/static_key.h> #include <linux/memcontrol.h> #include <linux/prefetch.h> #include <linux/compat.h> #include <linux/mroute.h> #include <linux/mroute6.h> #include <linux/icmpv6.h> #include <linux/uaccess.h> #include <linux/netdevice.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <linux/skbuff_ref.h> #include <net/net_namespace.h> #include <net/request_sock.h> #include <net/sock.h> #include <net/proto_memory.h> #include <linux/net_tstamp.h> #include <net/xfrm.h> #include <linux/ipsec.h> #include <net/cls_cgroup.h> #include <net/netprio_cgroup.h> #include <linux/sock_diag.h> #include <linux/filter.h> #include <net/sock_reuseport.h> #include <net/bpf_sk_storage.h> #include <trace/events/sock.h> #include <net/tcp.h> #include <net/busy_poll.h> #include <net/phonet/phonet.h> #include <linux/ethtool.h> #include "dev.h" static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); static void sock_def_write_space_wfree(struct sock *sk); static void sock_def_write_space(struct sock *sk); /** * sk_ns_capable - General socket capability test * @sk: Socket to use a capability on or through * @user_ns: The user namespace of the capability to use * @cap: The capability to use * * Test to see if the opener of the socket had when the socket was * created and the current process has the capability @cap in the user * namespace @user_ns. 
*/ bool sk_ns_capable(const struct sock *sk, struct user_namespace *user_ns, int cap) { return file_ns_capable(sk->sk_socket->file, user_ns, cap) && ns_capable(user_ns, cap); } EXPORT_SYMBOL(sk_ns_capable); /** * sk_capable - Socket global capability test * @sk: Socket to use a capability on or through * @cap: The global capability to use * * Test to see if the opener of the socket had when the socket was * created and the current process has the capability @cap in all user * namespaces. */ bool sk_capable(const struct sock *sk, int cap) { return sk_ns_capable(sk, &init_user_ns, cap); } EXPORT_SYMBOL(sk_capable); /** * sk_net_capable - Network namespace socket capability test * @sk: Socket to use a capability on or through * @cap: The capability to use * * Test to see if the opener of the socket had when the socket was created * and the current process has the capability @cap over the network namespace * the socket is a member of. */ bool sk_net_capable(const struct sock *sk, int cap) { return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); } EXPORT_SYMBOL(sk_net_capable); /* * Each address family might have different locking rules, so we have * one slock key per address family and separate keys for internal and * userspace sockets. */ static struct lock_class_key af_family_keys[AF_MAX]; static struct lock_class_key af_family_kern_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; /* * Make lock validator output more readable. (we pre-construct these * strings build-time, so that runtime initialization of socket * locks is fast): */ #define _sock_locks(x) \ x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \ x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \ x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \ x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \ x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \ x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \ x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \ x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \ x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \ x "27" , x "28" , x "AF_CAN" , \ x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \ x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ x "AF_MCTP" , \ x "AF_MAX" static const char *const af_family_key_strings[AF_MAX+1] = { _sock_locks("sk_lock-") }; static const char *const af_family_slock_key_strings[AF_MAX+1] = { _sock_locks("slock-") }; static const char *const af_family_clock_key_strings[AF_MAX+1] = { _sock_locks("clock-") }; static const char *const af_family_kern_key_strings[AF_MAX+1] = { _sock_locks("k-sk_lock-") }; static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = { _sock_locks("k-slock-") }; static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = { _sock_locks("k-clock-") }; static const char *const af_family_rlock_key_strings[AF_MAX+1] = { _sock_locks("rlock-") }; static const char *const af_family_wlock_key_strings[AF_MAX+1] = { _sock_locks("wlock-") }; static const char *const af_family_elock_key_strings[AF_MAX+1] = { _sock_locks("elock-") }; /* * sk_callback_lock and sk queues locking rules are per-address-family, * so split the lock classes by using a per-AF key: */ static struct lock_class_key af_callback_keys[AF_MAX]; static struct lock_class_key af_rlock_keys[AF_MAX]; static struct lock_class_key af_wlock_keys[AF_MAX]; static struct lock_class_key 
af_elock_keys[AF_MAX]; static struct lock_class_key af_kern_callback_keys[AF_MAX]; /* Run time adjustable parameters. */ __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; EXPORT_SYMBOL(sysctl_wmem_max); __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; EXPORT_SYMBOL(sysctl_rmem_max); __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); EXPORT_SYMBOL_GPL(memalloc_socks_key); /** * sk_set_memalloc - sets %SOCK_MEMALLOC * @sk: socket to set it on * * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. * It's the responsibility of the admin to adjust min_free_kbytes * to meet the requirements */ void sk_set_memalloc(struct sock *sk) { sock_set_flag(sk, SOCK_MEMALLOC); sk->sk_allocation |= __GFP_MEMALLOC; static_branch_inc(&memalloc_socks_key); } EXPORT_SYMBOL_GPL(sk_set_memalloc); void sk_clear_memalloc(struct sock *sk) { sock_reset_flag(sk, SOCK_MEMALLOC); sk->sk_allocation &= ~__GFP_MEMALLOC; static_branch_dec(&memalloc_socks_key); /* * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward * progress of swapping. SOCK_MEMALLOC may be cleared while * it has rmem allocations due to the last swapfile being deactivated * but there is a risk that the socket is unusable due to exceeding * the rmem limits. Reclaim the reserves and obey rmem limits again. */ sk_mem_reclaim(sk); } EXPORT_SYMBOL_GPL(sk_clear_memalloc); int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { int ret; unsigned int noreclaim_flag; /* these should have been dropped before queueing */ BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); noreclaim_flag = memalloc_noreclaim_save(); ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv, tcp_v6_do_rcv, tcp_v4_do_rcv, sk, skb); memalloc_noreclaim_restore(noreclaim_flag); return ret; } EXPORT_SYMBOL(__sk_backlog_rcv); void sk_error_report(struct sock *sk) { sk->sk_error_report(sk); switch (sk->sk_family) { case AF_INET: fallthrough; case AF_INET6: trace_inet_sk_error_report(sk); break; default: break; } } EXPORT_SYMBOL(sk_error_report); int sock_get_timeout(long timeo, void *optval, bool old_timeval) { struct __kernel_sock_timeval tv; if (timeo == MAX_SCHEDULE_TIMEOUT) { tv.tv_sec = 0; tv.tv_usec = 0; } else { tv.tv_sec = timeo / HZ; tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ; } if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec }; *(struct old_timeval32 *)optval = tv32; return sizeof(tv32); } if (old_timeval) { struct __kernel_old_timeval old_tv; old_tv.tv_sec = tv.tv_sec; old_tv.tv_usec = tv.tv_usec; *(struct __kernel_old_timeval *)optval = old_tv; return sizeof(old_tv); } *(struct __kernel_sock_timeval *)optval = tv; return sizeof(tv); } EXPORT_SYMBOL(sock_get_timeout); int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, sockptr_t optval, int optlen, bool old_timeval) { if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { struct old_timeval32 tv32; if (optlen < sizeof(tv32)) return -EINVAL; if (copy_from_sockptr(&tv32, optval, sizeof(tv32))) return -EFAULT; tv->tv_sec = tv32.tv_sec; tv->tv_usec = tv32.tv_usec; } else if (old_timeval) { struct __kernel_old_timeval old_tv; if (optlen < sizeof(old_tv)) return -EINVAL; if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv))) return -EFAULT; tv->tv_sec = old_tv.tv_sec; tv->tv_usec = old_tv.tv_usec; } else { if (optlen < sizeof(*tv)) return -EINVAL; if (copy_from_sockptr(tv, optval, sizeof(*tv))) return -EFAULT; } return 0; } 
EXPORT_SYMBOL(sock_copy_user_timeval); static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, bool old_timeval) { struct __kernel_sock_timeval tv; int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval); long val; if (err) return err; if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) return -EDOM; if (tv.tv_sec < 0) { static int warned __read_mostly; WRITE_ONCE(*timeo_p, 0); if (warned < 10 && net_ratelimit()) { warned++; pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", __func__, current->comm, task_pid_nr(current)); } return 0; } val = MAX_SCHEDULE_TIMEOUT; if ((tv.tv_sec || tv.tv_usec) && (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))) val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ); WRITE_ONCE(*timeo_p, val); return 0; } static bool sock_needs_netstamp(const struct sock *sk) { switch (sk->sk_family) { case AF_UNSPEC: case AF_UNIX: return false; default: return true; } } static void sock_disable_timestamp(struct sock *sk, unsigned long flags) { if (sk->sk_flags & flags) { sk->sk_flags &= ~flags; if (sock_needs_netstamp(sk) && !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) net_disable_timestamp(); } } int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { atomic_inc(&sk->sk_drops); trace_sock_rcvqueue_full(sk, skb); return -ENOMEM; } if (!sk_rmem_schedule(sk, skb, skb->truesize)) { atomic_inc(&sk->sk_drops); return -ENOBUFS; } skb->dev = NULL; skb_set_owner_r(skb, sk); /* we escape from rcu protected region, make sure we dont leak * a norefcounted dst */ skb_dst_force(skb); spin_lock_irqsave(&list->lock, flags); sock_skb_set_dropcount(sk, skb); __skb_queue_tail(list, skb); spin_unlock_irqrestore(&list->lock, flags); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); return 0; } EXPORT_SYMBOL(__sock_queue_rcv_skb); int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason) { enum skb_drop_reason drop_reason; int err; err = sk_filter(sk, skb); if (err) { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; goto out; } err = __sock_queue_rcv_skb(sk, skb); switch (err) { case -ENOMEM: drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF; break; case -ENOBUFS: drop_reason = SKB_DROP_REASON_PROTO_MEM; break; default: drop_reason = SKB_NOT_DROPPED_YET; break; } out: if (reason) *reason = drop_reason; return err; } EXPORT_SYMBOL(sock_queue_rcv_skb_reason); int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested, unsigned int trim_cap, bool refcounted) { int rc = NET_RX_SUCCESS; if (sk_filter_trim_cap(sk, skb, trim_cap)) goto discard_and_relse; skb->dev = NULL; if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) { atomic_inc(&sk->sk_drops); goto discard_and_relse; } if (nested) bh_lock_sock_nested(sk); else bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { /* * trylock + unlock semantics: */ mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); rc = sk_backlog_rcv(sk, skb); mutex_release(&sk->sk_lock.dep_map, _RET_IP_); } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) { bh_unlock_sock(sk); atomic_inc(&sk->sk_drops); goto discard_and_relse; } bh_unlock_sock(sk); out: if (refcounted) sock_put(sk); return rc; discard_and_relse: kfree_skb(skb); goto out; } EXPORT_SYMBOL(__sk_receive_skb); INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, u32)); INDIRECT_CALLABLE_DECLARE(struct dst_entry 
*ipv4_dst_check(struct dst_entry *, u32)); struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = __sk_dst_get(sk); if (dst && dst->obsolete && INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie) == NULL) { sk_tx_queue_clear(sk); WRITE_ONCE(sk->sk_dst_pending_confirm, 0); RCU_INIT_POINTER(sk->sk_dst_cache, NULL); dst_release(dst); return NULL; } return dst; } EXPORT_SYMBOL(__sk_dst_check); struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = sk_dst_get(sk); if (dst && dst->obsolete && INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie) == NULL) { sk_dst_reset(sk); dst_release(dst); return NULL; } return dst; } EXPORT_SYMBOL(sk_dst_check); static int sock_bindtoindex_locked(struct sock *sk, int ifindex) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES struct net *net = sock_net(sk); /* Sorry... */ ret = -EPERM; if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) goto out; ret = -EINVAL; if (ifindex < 0) goto out; /* Paired with all READ_ONCE() done locklessly. */ WRITE_ONCE(sk->sk_bound_dev_if, ifindex); if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); sk_dst_reset(sk); ret = 0; out: #endif return ret; } int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) { int ret; if (lock_sk) lock_sock(sk); ret = sock_bindtoindex_locked(sk, ifindex); if (lock_sk) release_sock(sk); return ret; } EXPORT_SYMBOL(sock_bindtoindex); static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES struct net *net = sock_net(sk); char devname[IFNAMSIZ]; int index; ret = -EINVAL; if (optlen < 0) goto out; /* Bind this socket to a particular device like "eth0", * as specified in the passed interface name. If the * name is "" or the option length is zero the socket * is not bound. */ if (optlen > IFNAMSIZ - 1) optlen = IFNAMSIZ - 1; memset(devname, 0, sizeof(devname)); ret = -EFAULT; if (copy_from_sockptr(devname, optval, optlen)) goto out; index = 0; if (devname[0] != '\0') { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_name_rcu(net, devname); if (dev) index = dev->ifindex; rcu_read_unlock(); ret = -ENODEV; if (!dev) goto out; } sockopt_lock_sock(sk); ret = sock_bindtoindex_locked(sk, index); sockopt_release_sock(sk); out: #endif return ret; } static int sock_getbindtodevice(struct sock *sk, sockptr_t optval, sockptr_t optlen, int len) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); struct net *net = sock_net(sk); char devname[IFNAMSIZ]; if (bound_dev_if == 0) { len = 0; goto zero; } ret = -EINVAL; if (len < IFNAMSIZ) goto out; ret = netdev_get_name(net, devname, bound_dev_if); if (ret) goto out; len = strlen(devname) + 1; ret = -EFAULT; if (copy_to_sockptr(optval, devname, len)) goto out; zero: ret = -EFAULT; if (copy_to_sockptr(optlen, &len, sizeof(int))) goto out; ret = 0; out: #endif return ret; } bool sk_mc_loop(const struct sock *sk) { if (dev_recursion_level()) return false; if (!sk) return true; /* IPV6_ADDRFORM can change sk->sk_family under us. 
*/ switch (READ_ONCE(sk->sk_family)) { case AF_INET: return inet_test_bit(MC_LOOP, sk); #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: return inet6_test_bit(MC6_LOOP, sk); #endif } WARN_ON_ONCE(1); return true; } EXPORT_SYMBOL(sk_mc_loop); void sock_set_reuseaddr(struct sock *sk) { lock_sock(sk); sk->sk_reuse = SK_CAN_REUSE; release_sock(sk); } EXPORT_SYMBOL(sock_set_reuseaddr); void sock_set_reuseport(struct sock *sk) { lock_sock(sk); sk->sk_reuseport = true; release_sock(sk); } EXPORT_SYMBOL(sock_set_reuseport); void sock_no_linger(struct sock *sk) { lock_sock(sk); WRITE_ONCE(sk->sk_lingertime, 0); sock_set_flag(sk, SOCK_LINGER); release_sock(sk); } EXPORT_SYMBOL(sock_no_linger); void sock_set_priority(struct sock *sk, u32 priority) { WRITE_ONCE(sk->sk_priority, priority); } EXPORT_SYMBOL(sock_set_priority); void sock_set_sndtimeo(struct sock *sk, s64 secs) { lock_sock(sk); if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) WRITE_ONCE(sk->sk_sndtimeo, secs * HZ); else WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT); release_sock(sk); } EXPORT_SYMBOL(sock_set_sndtimeo); static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) { sock_valbool_flag(sk, SOCK_RCVTSTAMP, val); sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns); if (val) { sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); sock_enable_timestamp(sk, SOCK_TIMESTAMP); } } void sock_enable_timestamps(struct sock *sk) { lock_sock(sk); __sock_set_timestamps(sk, true, false, true); release_sock(sk); } EXPORT_SYMBOL(sock_enable_timestamps); void sock_set_timestamp(struct sock *sk, int optname, bool valbool) { switch (optname) { case SO_TIMESTAMP_OLD: __sock_set_timestamps(sk, valbool, false, false); break; case SO_TIMESTAMP_NEW: __sock_set_timestamps(sk, valbool, true, false); break; case SO_TIMESTAMPNS_OLD: __sock_set_timestamps(sk, valbool, false, true); break; case SO_TIMESTAMPNS_NEW: __sock_set_timestamps(sk, valbool, true, true); break; } } static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) { struct net *net = sock_net(sk); struct net_device *dev = NULL; bool match = false; int *vclock_index; int i, num; if (sk->sk_bound_dev_if) dev = dev_get_by_index(net, sk->sk_bound_dev_if); if (!dev) { pr_err("%s: sock not bind to device\n", __func__); return -EOPNOTSUPP; } num = ethtool_get_phc_vclocks(dev, &vclock_index); dev_put(dev); for (i = 0; i < num; i++) { if (*(vclock_index + i) == phc_index) { match = true; break; } } if (num > 0) kfree(vclock_index); if (!match) return -EINVAL; WRITE_ONCE(sk->sk_bind_phc, phc_index); return 0; } int sock_set_timestamping(struct sock *sk, int optname, struct so_timestamping timestamping) { int val = timestamping.flags; int ret; if (val & ~SOF_TIMESTAMPING_MASK) return -EINVAL; if (val & SOF_TIMESTAMPING_OPT_ID_TCP && !(val & SOF_TIMESTAMPING_OPT_ID)) return -EINVAL; if (val & SOF_TIMESTAMPING_OPT_ID && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { if (sk_is_tcp(sk)) { if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) return -EINVAL; if (val & SOF_TIMESTAMPING_OPT_ID_TCP) atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq); else atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); } else { atomic_set(&sk->sk_tskey, 0); } } if (val & SOF_TIMESTAMPING_OPT_STATS && !(val & SOF_TIMESTAMPING_OPT_TSONLY)) return -EINVAL; if (val & SOF_TIMESTAMPING_BIND_PHC) { ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc); if (ret) return ret; } WRITE_ONCE(sk->sk_tsflags, val); sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); if (val & 
SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE); else sock_disable_timestamp(sk, (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); return 0; } void sock_set_keepalive(struct sock *sk) { lock_sock(sk); if (sk->sk_prot->keepalive) sk->sk_prot->keepalive(sk, true); sock_valbool_flag(sk, SOCK_KEEPOPEN, true); release_sock(sk); } EXPORT_SYMBOL(sock_set_keepalive); static void __sock_set_rcvbuf(struct sock *sk, int val) { /* Ensure val * 2 fits into an int, to prevent max_t() from treating it * as a negative value. */ val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_RCVBUF_LOCK; /* We double it on the way in to account for "struct sk_buff" etc. * overhead. Applications assume that the SO_RCVBUF setting they make * will allow that much actual data to be received on that socket. * * Applications are unaware that "struct sk_buff" and other overheads * allocate from the receive buffer during socket buffer allocation. * * And after considering the possible alternatives, returning the value * we actually used in getsockopt is the most desirable behavior. */ WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); } void sock_set_rcvbuf(struct sock *sk, int val) { lock_sock(sk); __sock_set_rcvbuf(sk, val); release_sock(sk); } EXPORT_SYMBOL(sock_set_rcvbuf); static void __sock_set_mark(struct sock *sk, u32 val) { if (val != sk->sk_mark) { WRITE_ONCE(sk->sk_mark, val); sk_dst_reset(sk); } } void sock_set_mark(struct sock *sk, u32 val) { lock_sock(sk); __sock_set_mark(sk, val); release_sock(sk); } EXPORT_SYMBOL(sock_set_mark); static void sock_release_reserved_memory(struct sock *sk, int bytes) { /* Round down bytes to multiple of pages */ bytes = round_down(bytes, PAGE_SIZE); WARN_ON(bytes > sk->sk_reserved_mem); WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes); sk_mem_reclaim(sk); } static int sock_reserve_memory(struct sock *sk, int bytes) { long allocated; bool charged; int pages; if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk)) return -EOPNOTSUPP; if (!bytes) return 0; pages = sk_mem_pages(bytes); /* pre-charge to memcg */ charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!charged) return -ENOMEM; /* pre-charge to forward_alloc */ sk_memory_allocated_add(sk, pages); allocated = sk_memory_allocated(sk); /* If the system goes into memory pressure with this * precharge, give up and return error. */ if (allocated > sk_prot_mem_limits(sk, 1)) { sk_memory_allocated_sub(sk, pages); mem_cgroup_uncharge_skmem(sk->sk_memcg, pages); return -ENOMEM; } sk_forward_alloc_add(sk, pages << PAGE_SHIFT); WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem + (pages << PAGE_SHIFT)); return 0; } #ifdef CONFIG_PAGE_POOL /* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED * in 1 syscall. The limit exists to limit the amount of memory the kernel * allocates to copy these tokens, and to prevent looping over the frags for * too long. 
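 *
 * Illustrative userspace sketch (example values only; the field names
 * follow the tokens[i].token_start / token_count accesses below, and the
 * handler returns the number of frags it actually released):
 *
 *	struct dmabuf_token tok = { .token_start = 0, .token_count = 1 };
 *
 *	ret = setsockopt(fd, SOL_SOCKET, SO_DEVMEM_DONTNEED,
 *			 &tok, sizeof(tok));
 *
 * The socket must be a TCP socket (see the sk_is_tcp() check below) and
 * the option length must be a multiple of sizeof(struct dmabuf_token).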
*/ #define MAX_DONTNEED_TOKENS 128 #define MAX_DONTNEED_FRAGS 1024 static noinline_for_stack int sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen) { unsigned int num_tokens, i, j, k, netmem_num = 0; struct dmabuf_token *tokens; int ret = 0, num_frags = 0; netmem_ref netmems[16]; if (!sk_is_tcp(sk)) return -EBADF; if (optlen % sizeof(*tokens) || optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS) return -EINVAL; num_tokens = optlen / sizeof(*tokens); tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL); if (!tokens) return -ENOMEM; if (copy_from_sockptr(tokens, optval, optlen)) { kvfree(tokens); return -EFAULT; } xa_lock_bh(&sk->sk_user_frags); for (i = 0; i < num_tokens; i++) { for (j = 0; j < tokens[i].token_count; j++) { if (++num_frags > MAX_DONTNEED_FRAGS) goto frag_limit_reached; netmem_ref netmem = (__force netmem_ref)__xa_erase( &sk->sk_user_frags, tokens[i].token_start + j); if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem))) continue; netmems[netmem_num++] = netmem; if (netmem_num == ARRAY_SIZE(netmems)) { xa_unlock_bh(&sk->sk_user_frags); for (k = 0; k < netmem_num; k++) WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); netmem_num = 0; xa_lock_bh(&sk->sk_user_frags); } ret++; } } frag_limit_reached: xa_unlock_bh(&sk->sk_user_frags); for (k = 0; k < netmem_num; k++) WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); kvfree(tokens); return ret; } #endif void sockopt_lock_sock(struct sock *sk) { /* When current->bpf_ctx is set, the setsockopt is called from * a bpf prog. bpf has ensured the sk lock has been * acquired before calling setsockopt(). */ if (has_current_bpf_ctx()) return; lock_sock(sk); } EXPORT_SYMBOL(sockopt_lock_sock); void sockopt_release_sock(struct sock *sk) { if (has_current_bpf_ctx()) return; release_sock(sk); } EXPORT_SYMBOL(sockopt_release_sock); bool sockopt_ns_capable(struct user_namespace *ns, int cap) { return has_current_bpf_ctx() || ns_capable(ns, cap); } EXPORT_SYMBOL(sockopt_ns_capable); bool sockopt_capable(int cap) { return has_current_bpf_ctx() || capable(cap); } EXPORT_SYMBOL(sockopt_capable); static int sockopt_validate_clockid(__kernel_clockid_t value) { switch (value) { case CLOCK_REALTIME: case CLOCK_MONOTONIC: case CLOCK_TAI: return 0; } return -EINVAL; } /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. */ int sk_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct so_timestamping timestamping; struct socket *sock = sk->sk_socket; struct sock_txtime sk_txtime; int val; int valbool; struct linger ling; int ret = 0; /* * Options without arguments */ if (optname == SO_BINDTODEVICE) return sock_setbindtodevice(sk, optval, optlen); if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; valbool = val ? 1 : 0; /* handle options which do not require locking the socket. 
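 * These cases rely on WRITE_ONCE()/READ_ONCE() pairs, atomic bitops, or
 * their own internal locking (e.g. sock_devmem_dontneed() above) instead
 * of the socket lock, so they return directly without going through
 * sockopt_lock_sock() further down.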
*/ switch (optname) { case SO_PRIORITY: if ((val >= 0 && val <= 6) || sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { sock_set_priority(sk, val); return 0; } return -EPERM; case SO_PASSSEC: assign_bit(SOCK_PASSSEC, &sock->flags, valbool); return 0; case SO_PASSCRED: assign_bit(SOCK_PASSCRED, &sock->flags, valbool); return 0; case SO_PASSPIDFD: assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool); return 0; case SO_TYPE: case SO_PROTOCOL: case SO_DOMAIN: case SO_ERROR: return -ENOPROTOOPT; #ifdef CONFIG_NET_RX_BUSY_POLL case SO_BUSY_POLL: if (val < 0) return -EINVAL; WRITE_ONCE(sk->sk_ll_usec, val); return 0; case SO_PREFER_BUSY_POLL: if (valbool && !sockopt_capable(CAP_NET_ADMIN)) return -EPERM; WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); return 0; case SO_BUSY_POLL_BUDGET: if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) return -EPERM; if (val < 0 || val > U16_MAX) return -EINVAL; WRITE_ONCE(sk->sk_busy_poll_budget, val); return 0; #endif case SO_MAX_PACING_RATE: { unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val; unsigned long pacing_rate; if (sizeof(ulval) != sizeof(val) && optlen >= sizeof(ulval) && copy_from_sockptr(&ulval, optval, sizeof(ulval))) { return -EFAULT; } if (ulval != ~0UL) cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); /* Pairs with READ_ONCE() from sk_getsockopt() */ WRITE_ONCE(sk->sk_max_pacing_rate, ulval); pacing_rate = READ_ONCE(sk->sk_pacing_rate); if (ulval < pacing_rate) WRITE_ONCE(sk->sk_pacing_rate, ulval); return 0; } case SO_TXREHASH: if (val < -1 || val > 1) return -EINVAL; if ((u8)val == SOCK_TXREHASH_DEFAULT) val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); /* Paired with READ_ONCE() in tcp_rtx_synack() * and sk_getsockopt(). */ WRITE_ONCE(sk->sk_txrehash, (u8)val); return 0; case SO_PEEK_OFF: { int (*set_peek_off)(struct sock *sk, int val); set_peek_off = READ_ONCE(sock->ops)->set_peek_off; if (set_peek_off) ret = set_peek_off(sk, val); else ret = -EOPNOTSUPP; return ret; } #ifdef CONFIG_PAGE_POOL case SO_DEVMEM_DONTNEED: return sock_devmem_dontneed(sk, optval, optlen); #endif } sockopt_lock_sock(sk); switch (optname) { case SO_DEBUG: if (val && !sockopt_capable(CAP_NET_ADMIN)) ret = -EACCES; else sock_valbool_flag(sk, SOCK_DBG, valbool); break; case SO_REUSEADDR: sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); break; case SO_REUSEPORT: sk->sk_reuseport = valbool; break; case SO_DONTROUTE: sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); sk_dst_reset(sk); break; case SO_BROADCAST: sock_valbool_flag(sk, SOCK_BROADCAST, valbool); break; case SO_SNDBUF: /* Don't error on this BSD doesn't and if you think * about it this is right. Otherwise apps have to * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ val = min_t(u32, val, READ_ONCE(sysctl_wmem_max)); set_sndbuf: /* Ensure val * 2 fits into an int, to prevent max_t() * from treating it as a negative value. */ val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_SNDBUF_LOCK; WRITE_ONCE(sk->sk_sndbuf, max_t(int, val * 2, SOCK_MIN_SNDBUF)); /* Wake up sending tasks if we upped the value. */ sk->sk_write_space(sk); break; case SO_SNDBUFFORCE: if (!sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; break; } /* No negative values (to prevent underflow, as val will be * multiplied by 2). */ if (val < 0) val = 0; goto set_sndbuf; case SO_RCVBUF: /* Don't error on this BSD doesn't and if you think * about it this is right. 
Otherwise apps have to * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max))); break; case SO_RCVBUFFORCE: if (!sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; break; } /* No negative values (to prevent underflow, as val will be * multiplied by 2). */ __sock_set_rcvbuf(sk, max(val, 0)); break; case SO_KEEPALIVE: if (sk->sk_prot->keepalive) sk->sk_prot->keepalive(sk, valbool); sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); break; case SO_OOBINLINE: sock_valbool_flag(sk, SOCK_URGINLINE, valbool); break; case SO_NO_CHECK: sk->sk_no_check_tx = valbool; break; case SO_LINGER: if (optlen < sizeof(ling)) { ret = -EINVAL; /* 1003.1g */ break; } if (copy_from_sockptr(&ling, optval, sizeof(ling))) { ret = -EFAULT; break; } if (!ling.l_onoff) { sock_reset_flag(sk, SOCK_LINGER); } else { unsigned long t_sec = ling.l_linger; if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ) WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT); else WRITE_ONCE(sk->sk_lingertime, t_sec * HZ); sock_set_flag(sk, SOCK_LINGER); } break; case SO_BSDCOMPAT: break; case SO_TIMESTAMP_OLD: case SO_TIMESTAMP_NEW: case SO_TIMESTAMPNS_OLD: case SO_TIMESTAMPNS_NEW: sock_set_timestamp(sk, optname, valbool); break; case SO_TIMESTAMPING_NEW: case SO_TIMESTAMPING_OLD: if (optlen == sizeof(timestamping)) { if (copy_from_sockptr(&timestamping, optval, sizeof(timestamping))) { ret = -EFAULT; break; } } else { memset(&timestamping, 0, sizeof(timestamping)); timestamping.flags = val; } ret = sock_set_timestamping(sk, optname, timestamping); break; case SO_RCVLOWAT: { int (*set_rcvlowat)(struct sock *sk, int val) = NULL; if (val < 0) val = INT_MAX; if (sock) set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat; if (set_rcvlowat) ret = set_rcvlowat(sk, val); else WRITE_ONCE(sk->sk_rcvlowat, val ? 
: 1); break; } case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD); break; case SO_SNDTIMEO_OLD: case SO_SNDTIMEO_NEW: ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD); break; case SO_ATTACH_FILTER: { struct sock_fprog fprog; ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); if (!ret) ret = sk_attach_filter(&fprog, sk); break; } case SO_ATTACH_BPF: ret = -EINVAL; if (optlen == sizeof(u32)) { u32 ufd; ret = -EFAULT; if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) break; ret = sk_attach_bpf(ufd, sk); } break; case SO_ATTACH_REUSEPORT_CBPF: { struct sock_fprog fprog; ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); if (!ret) ret = sk_reuseport_attach_filter(&fprog, sk); break; } case SO_ATTACH_REUSEPORT_EBPF: ret = -EINVAL; if (optlen == sizeof(u32)) { u32 ufd; ret = -EFAULT; if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) break; ret = sk_reuseport_attach_bpf(ufd, sk); } break; case SO_DETACH_REUSEPORT_BPF: ret = reuseport_detach_prog(sk); break; case SO_DETACH_FILTER: ret = sk_detach_filter(sk); break; case SO_LOCK_FILTER: if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) ret = -EPERM; else sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); break; case SO_MARK: if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } __sock_set_mark(sk, val); break; case SO_RCVMARK: sock_valbool_flag(sk, SOCK_RCVMARK, valbool); break; case SO_RXQ_OVFL: sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); break; case SO_WIFI_STATUS: sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); break; case SO_NOFCS: sock_valbool_flag(sk, SOCK_NOFCS, valbool); break; case SO_SELECT_ERR_QUEUE: sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); break; case SO_INCOMING_CPU: reuseport_update_incoming_cpu(sk, val); break; case SO_CNX_ADVICE: if (val == 1) dst_negative_advice(sk); break; case SO_ZEROCOPY: if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { if (!(sk_is_tcp(sk) || (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP))) ret = -EOPNOTSUPP; } else if (sk->sk_family != PF_RDS) { ret = -EOPNOTSUPP; } if (!ret) { if (val < 0 || val > 1) ret = -EINVAL; else sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); } break; case SO_TXTIME: if (optlen != sizeof(struct sock_txtime)) { ret = -EINVAL; break; } else if (copy_from_sockptr(&sk_txtime, optval, sizeof(struct sock_txtime))) { ret = -EFAULT; break; } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { ret = -EINVAL; break; } /* CLOCK_MONOTONIC is only used by sch_fq, and this packet * scheduler has enough safe guards. 
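 *
 * An illustrative userspace sketch of enabling SO_TXTIME (example values
 * only; the struct and flag names follow the copy_from_sockptr() of
 * struct sock_txtime handled above):
 *
 *	struct sock_txtime cfg = {
 *		.clockid = CLOCK_MONOTONIC,
 *		.flags	 = SOF_TXTIME_REPORT_ERRORS,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_TXTIME, &cfg, sizeof(cfg));
 *
 * Per-packet transmit times are then passed via an SCM_TXTIME cmsg, see
 * __sock_cmsg_send() further down.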
*/ if (sk_txtime.clockid != CLOCK_MONOTONIC && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } ret = sockopt_validate_clockid(sk_txtime.clockid); if (ret) break; sock_valbool_flag(sk, SOCK_TXTIME, true); sk->sk_clockid = sk_txtime.clockid; sk->sk_txtime_deadline_mode = !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); sk->sk_txtime_report_errors = !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); break; case SO_BINDTOIFINDEX: ret = sock_bindtoindex_locked(sk, val); break; case SO_BUF_LOCK: if (val & ~SOCK_BUF_LOCK_MASK) { ret = -EINVAL; break; } sk->sk_userlocks = val | (sk->sk_userlocks & ~SOCK_BUF_LOCK_MASK); break; case SO_RESERVE_MEM: { int delta; if (val < 0) { ret = -EINVAL; break; } delta = val - sk->sk_reserved_mem; if (delta < 0) sock_release_reserved_memory(sk, -delta); else ret = sock_reserve_memory(sk, delta); break; } default: ret = -ENOPROTOOPT; break; } sockopt_release_sock(sk); return ret; } int sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { return sk_setsockopt(sock->sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_setsockopt); static const struct cred *sk_get_peer_cred(struct sock *sk) { const struct cred *cred; spin_lock(&sk->sk_peer_lock); cred = get_cred(sk->sk_peer_cred); spin_unlock(&sk->sk_peer_lock); return cred; } static void cred_to_ucred(struct pid *pid, const struct cred *cred, struct ucred *ucred) { ucred->pid = pid_vnr(pid); ucred->uid = ucred->gid = -1; if (cred) { struct user_namespace *current_ns = current_user_ns(); ucred->uid = from_kuid_munged(current_ns, cred->euid); ucred->gid = from_kgid_munged(current_ns, cred->egid); } } static int groups_to_user(sockptr_t dst, const struct group_info *src) { struct user_namespace *user_ns = current_user_ns(); int i; for (i = 0; i < src->ngroups; i++) { gid_t gid = from_kgid_munged(user_ns, src->gid[i]); if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid))) return -EFAULT; } return 0; } int sk_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen) { struct socket *sock = sk->sk_socket; union { int val; u64 val64; unsigned long ulval; struct linger ling; struct old_timeval32 tm32; struct __kernel_old_timeval tm; struct __kernel_sock_timeval stm; struct sock_txtime txtime; struct so_timestamping timestamping; } v; int lv = sizeof(int); int len; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0) return -EINVAL; memset(&v, 0, sizeof(v)); switch (optname) { case SO_DEBUG: v.val = sock_flag(sk, SOCK_DBG); break; case SO_DONTROUTE: v.val = sock_flag(sk, SOCK_LOCALROUTE); break; case SO_BROADCAST: v.val = sock_flag(sk, SOCK_BROADCAST); break; case SO_SNDBUF: v.val = READ_ONCE(sk->sk_sndbuf); break; case SO_RCVBUF: v.val = READ_ONCE(sk->sk_rcvbuf); break; case SO_REUSEADDR: v.val = sk->sk_reuse; break; case SO_REUSEPORT: v.val = sk->sk_reuseport; break; case SO_KEEPALIVE: v.val = sock_flag(sk, SOCK_KEEPOPEN); break; case SO_TYPE: v.val = sk->sk_type; break; case SO_PROTOCOL: v.val = sk->sk_protocol; break; case SO_DOMAIN: v.val = sk->sk_family; break; case SO_ERROR: v.val = -sock_error(sk); if (v.val == 0) v.val = xchg(&sk->sk_err_soft, 0); break; case SO_OOBINLINE: v.val = sock_flag(sk, SOCK_URGINLINE); break; case SO_NO_CHECK: v.val = sk->sk_no_check_tx; break; case SO_PRIORITY: v.val = READ_ONCE(sk->sk_priority); break; case SO_LINGER: lv = sizeof(v.ling); v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ; 
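		/* Note: l_linger is reported back in seconds; sk_lingertime is
		 * kept in jiffies and scaled by HZ in both directions (see the
		 * SO_LINGER branch of sk_setsockopt()).  Userspace round trip,
		 * with example values only:
		 *
		 *	struct linger l = { .l_onoff = 1, .l_linger = 5 };
		 *	socklen_t len = sizeof(l);
		 *
		 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
		 *	getsockopt(fd, SOL_SOCKET, SO_LINGER, &l, &len);
		 *
		 * reads back l.l_linger == 5.
		 */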
break; case SO_BSDCOMPAT: break; case SO_TIMESTAMP_OLD: v.val = sock_flag(sk, SOCK_RCVTSTAMP) && !sock_flag(sk, SOCK_TSTAMP_NEW) && !sock_flag(sk, SOCK_RCVTSTAMPNS); break; case SO_TIMESTAMPNS_OLD: v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); break; case SO_TIMESTAMP_NEW: v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); break; case SO_TIMESTAMPNS_NEW: v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); break; case SO_TIMESTAMPING_OLD: case SO_TIMESTAMPING_NEW: lv = sizeof(v.timestamping); /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only * returning the flags when they were set through the same option. * Don't change the beviour for the old case SO_TIMESTAMPING_OLD. */ if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) { v.timestamping.flags = READ_ONCE(sk->sk_tsflags); v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc); } break; case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v, SO_RCVTIMEO_OLD == optname); break; case SO_SNDTIMEO_OLD: case SO_SNDTIMEO_NEW: lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v, SO_SNDTIMEO_OLD == optname); break; case SO_RCVLOWAT: v.val = READ_ONCE(sk->sk_rcvlowat); break; case SO_SNDLOWAT: v.val = 1; break; case SO_PASSCRED: v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); break; case SO_PASSPIDFD: v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags); break; case SO_PEERCRED: { struct ucred peercred; if (len > sizeof(peercred)) len = sizeof(peercred); spin_lock(&sk->sk_peer_lock); cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); spin_unlock(&sk->sk_peer_lock); if (copy_to_sockptr(optval, &peercred, len)) return -EFAULT; goto lenout; } case SO_PEERPIDFD: { struct pid *peer_pid; struct file *pidfd_file = NULL; int pidfd; if (len > sizeof(pidfd)) len = sizeof(pidfd); spin_lock(&sk->sk_peer_lock); peer_pid = get_pid(sk->sk_peer_pid); spin_unlock(&sk->sk_peer_lock); if (!peer_pid) return -ENODATA; pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file); put_pid(peer_pid); if (pidfd < 0) return pidfd; if (copy_to_sockptr(optval, &pidfd, len) || copy_to_sockptr(optlen, &len, sizeof(int))) { put_unused_fd(pidfd); fput(pidfd_file); return -EFAULT; } fd_install(pidfd, pidfd_file); return 0; } case SO_PEERGROUPS: { const struct cred *cred; int ret, n; cred = sk_get_peer_cred(sk); if (!cred) return -ENODATA; n = cred->group_info->ngroups; if (len < n * sizeof(gid_t)) { len = n * sizeof(gid_t); put_cred(cred); return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE; } len = n * sizeof(gid_t); ret = groups_to_user(optval, cred->group_info); put_cred(cred); if (ret) return ret; goto lenout; } case SO_PEERNAME: { struct sockaddr_storage address; lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2); if (lv < 0) return -ENOTCONN; if (lv < len) return -EINVAL; if (copy_to_sockptr(optval, &address, len)) return -EFAULT; goto lenout; } /* Dubious BSD thing... Probably nobody even uses it, but * the UNIX standard wants it for whatever reason... 
-DaveM */ case SO_ACCEPTCONN: v.val = sk->sk_state == TCP_LISTEN; break; case SO_PASSSEC: v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); break; case SO_PEERSEC: return security_socket_getpeersec_stream(sock, optval, optlen, len); case SO_MARK: v.val = READ_ONCE(sk->sk_mark); break; case SO_RCVMARK: v.val = sock_flag(sk, SOCK_RCVMARK); break; case SO_RXQ_OVFL: v.val = sock_flag(sk, SOCK_RXQ_OVFL); break; case SO_WIFI_STATUS: v.val = sock_flag(sk, SOCK_WIFI_STATUS); break; case SO_PEEK_OFF: if (!READ_ONCE(sock->ops)->set_peek_off) return -EOPNOTSUPP; v.val = READ_ONCE(sk->sk_peek_off); break; case SO_NOFCS: v.val = sock_flag(sk, SOCK_NOFCS); break; case SO_BINDTODEVICE: return sock_getbindtodevice(sk, optval, optlen, len); case SO_GET_FILTER: len = sk_get_filter(sk, optval, len); if (len < 0) return len; goto lenout; case SO_LOCK_FILTER: v.val = sock_flag(sk, SOCK_FILTER_LOCKED); break; case SO_BPF_EXTENSIONS: v.val = bpf_tell_extensions(); break; case SO_SELECT_ERR_QUEUE: v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); break; #ifdef CONFIG_NET_RX_BUSY_POLL case SO_BUSY_POLL: v.val = READ_ONCE(sk->sk_ll_usec); break; case SO_PREFER_BUSY_POLL: v.val = READ_ONCE(sk->sk_prefer_busy_poll); break; #endif case SO_MAX_PACING_RATE: /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */ if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { lv = sizeof(v.ulval); v.ulval = READ_ONCE(sk->sk_max_pacing_rate); } else { /* 32bit version */ v.val = min_t(unsigned long, ~0U, READ_ONCE(sk->sk_max_pacing_rate)); } break; case SO_INCOMING_CPU: v.val = READ_ONCE(sk->sk_incoming_cpu); break; case SO_MEMINFO: { u32 meminfo[SK_MEMINFO_VARS]; sk_get_meminfo(sk, meminfo); len = min_t(unsigned int, len, sizeof(meminfo)); if (copy_to_sockptr(optval, &meminfo, len)) return -EFAULT; goto lenout; } #ifdef CONFIG_NET_RX_BUSY_POLL case SO_INCOMING_NAPI_ID: v.val = READ_ONCE(sk->sk_napi_id); /* aggregate non-NAPI IDs down to 0 */ if (v.val < MIN_NAPI_ID) v.val = 0; break; #endif case SO_COOKIE: lv = sizeof(u64); if (len < lv) return -EINVAL; v.val64 = sock_gen_cookie(sk); break; case SO_ZEROCOPY: v.val = sock_flag(sk, SOCK_ZEROCOPY); break; case SO_TXTIME: lv = sizeof(v.txtime); v.txtime.clockid = sk->sk_clockid; v.txtime.flags |= sk->sk_txtime_deadline_mode ? SOF_TXTIME_DEADLINE_MODE : 0; v.txtime.flags |= sk->sk_txtime_report_errors ? SOF_TXTIME_REPORT_ERRORS : 0; break; case SO_BINDTOIFINDEX: v.val = READ_ONCE(sk->sk_bound_dev_if); break; case SO_NETNS_COOKIE: lv = sizeof(u64); if (len != lv) return -EINVAL; v.val64 = sock_net(sk)->net_cookie; break; case SO_BUF_LOCK: v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK; break; case SO_RESERVE_MEM: v.val = READ_ONCE(sk->sk_reserved_mem); break; case SO_TXREHASH: /* Paired with WRITE_ONCE() in sk_setsockopt() */ v.val = READ_ONCE(sk->sk_txrehash); break; default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). */ return -ENOPROTOOPT; } if (len > lv) len = lv; if (copy_to_sockptr(optval, &v, len)) return -EFAULT; lenout: if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; return 0; } /* * Initialize an sk_lock. * * (We also register the sk_lock with the lock validator.) 
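 *
 * Kernel sockets (sk->sk_kern_sock) are given their own per-family lockdep
 * keys and class names (the af_family_kern_* arrays), so that nesting a
 * kernel socket with a user socket of the same family does not get flagged
 * as recursive locking by lockdep.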
*/ static inline void sock_lock_init(struct sock *sk) { if (sk->sk_kern_sock) sock_lock_init_class_and_name( sk, af_family_kern_slock_key_strings[sk->sk_family], af_family_kern_slock_keys + sk->sk_family, af_family_kern_key_strings[sk->sk_family], af_family_kern_keys + sk->sk_family); else sock_lock_init_class_and_name( sk, af_family_slock_key_strings[sk->sk_family], af_family_slock_keys + sk->sk_family, af_family_key_strings[sk->sk_family], af_family_keys + sk->sk_family); } /* * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, * even temporarily, because of RCU lookups. sk_node should also be left as is. * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end */ static void sock_copy(struct sock *nsk, const struct sock *osk) { const struct proto *prot = READ_ONCE(osk->sk_prot); #ifdef CONFIG_SECURITY_NETWORK void *sptr = nsk->sk_security; #endif /* If we move sk_tx_queue_mapping out of the private section, * we must check if sk_tx_queue_clear() is called after * sock_copy() in sk_clone_lock(). */ BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) < offsetof(struct sock, sk_dontcopy_begin) || offsetof(struct sock, sk_tx_queue_mapping) >= offsetof(struct sock, sk_dontcopy_end)); memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, prot->obj_size - offsetof(struct sock, sk_dontcopy_end), /* alloc is larger than struct, see sk_prot_alloc() */); #ifdef CONFIG_SECURITY_NETWORK nsk->sk_security = sptr; security_sk_clone(osk, nsk); #endif } static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, int family) { struct sock *sk; struct kmem_cache *slab; slab = prot->slab; if (slab != NULL) { sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); if (!sk) return sk; if (want_init_on_alloc(priority)) sk_prot_clear_nulls(sk, prot->obj_size); } else sk = kmalloc(prot->obj_size, priority); if (sk != NULL) { if (security_sk_alloc(sk, family, priority)) goto out_free; if (!try_module_get(prot->owner)) goto out_free_sec; } return sk; out_free_sec: security_sk_free(sk); out_free: if (slab != NULL) kmem_cache_free(slab, sk); else kfree(sk); return NULL; } static void sk_prot_free(struct proto *prot, struct sock *sk) { struct kmem_cache *slab; struct module *owner; owner = prot->owner; slab = prot->slab; cgroup_sk_free(&sk->sk_cgrp_data); mem_cgroup_sk_free(sk); security_sk_free(sk); if (slab != NULL) kmem_cache_free(slab, sk); else kfree(sk); module_put(owner); } /** * sk_alloc - All socket objects are allocated here * @net: the applicable net namespace * @family: protocol family * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * @prot: struct proto associated with this new sock instance * @kern: is this to be a kernel socket? */ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern) { struct sock *sk; sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); if (sk) { sk->sk_family = family; /* * See comment in struct sock definition to understand * why we need sk_prot_creator -acme */ sk->sk_prot = sk->sk_prot_creator = prot; sk->sk_kern_sock = kern; sock_lock_init(sk); sk->sk_net_refcnt = kern ? 
0 : 1; if (likely(sk->sk_net_refcnt)) { get_net_track(net, &sk->ns_tracker, priority); sock_inuse_add(net, 1); } else { __netns_tracker_alloc(net, &sk->ns_tracker, false, priority); } sock_net_set(sk, net); refcount_set(&sk->sk_wmem_alloc, 1); mem_cgroup_sk_alloc(sk); cgroup_sk_alloc(&sk->sk_cgrp_data); sock_update_classid(&sk->sk_cgrp_data); sock_update_netprioidx(&sk->sk_cgrp_data); sk_tx_queue_clear(sk); } return sk; } EXPORT_SYMBOL(sk_alloc); /* Sockets having SOCK_RCU_FREE will call this function after one RCU * grace period. This is the case for UDP sockets and TCP listeners. */ static void __sk_destruct(struct rcu_head *head) { struct sock *sk = container_of(head, struct sock, sk_rcu); struct sk_filter *filter; if (sk->sk_destruct) sk->sk_destruct(sk); filter = rcu_dereference_check(sk->sk_filter, refcount_read(&sk->sk_wmem_alloc) == 0); if (filter) { sk_filter_uncharge(sk, filter); RCU_INIT_POINTER(sk->sk_filter, NULL); } sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); #ifdef CONFIG_BPF_SYSCALL bpf_sk_storage_free(sk); #endif if (atomic_read(&sk->sk_omem_alloc)) pr_debug("%s: optmem leakage (%d bytes) detected\n", __func__, atomic_read(&sk->sk_omem_alloc)); if (sk->sk_frag.page) { put_page(sk->sk_frag.page); sk->sk_frag.page = NULL; } /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); if (likely(sk->sk_net_refcnt)) put_net_track(sock_net(sk), &sk->ns_tracker); else __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); sk_prot_free(sk->sk_prot_creator, sk); } void sk_destruct(struct sock *sk) { bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); if (rcu_access_pointer(sk->sk_reuseport_cb)) { reuseport_detach_sock(sk); use_call_rcu = true; } if (use_call_rcu) call_rcu(&sk->sk_rcu, __sk_destruct); else __sk_destruct(&sk->sk_rcu); } static void __sk_free(struct sock *sk) { if (likely(sk->sk_net_refcnt)) sock_inuse_add(sock_net(sk), -1); if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) sock_diag_broadcast_destroy(sk); else sk_destruct(sk); } void sk_free(struct sock *sk) { /* * We subtract one from sk_wmem_alloc and can know if * some packets are still in some tx queue. 
* If not null, sock_wfree() will call __sk_free(sk) later */ if (refcount_dec_and_test(&sk->sk_wmem_alloc)) __sk_free(sk); } EXPORT_SYMBOL(sk_free); static void sk_init_common(struct sock *sk) { skb_queue_head_init(&sk->sk_receive_queue); skb_queue_head_init(&sk->sk_write_queue); skb_queue_head_init(&sk->sk_error_queue); rwlock_init(&sk->sk_callback_lock); lockdep_set_class_and_name(&sk->sk_receive_queue.lock, af_rlock_keys + sk->sk_family, af_family_rlock_key_strings[sk->sk_family]); lockdep_set_class_and_name(&sk->sk_write_queue.lock, af_wlock_keys + sk->sk_family, af_family_wlock_key_strings[sk->sk_family]); lockdep_set_class_and_name(&sk->sk_error_queue.lock, af_elock_keys + sk->sk_family, af_family_elock_key_strings[sk->sk_family]); if (sk->sk_kern_sock) lockdep_set_class_and_name(&sk->sk_callback_lock, af_kern_callback_keys + sk->sk_family, af_family_kern_clock_key_strings[sk->sk_family]); else lockdep_set_class_and_name(&sk->sk_callback_lock, af_callback_keys + sk->sk_family, af_family_clock_key_strings[sk->sk_family]); } /** * sk_clone_lock - clone a socket, and lock its clone * @sk: the socket to clone * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) */ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) { struct proto *prot = READ_ONCE(sk->sk_prot); struct sk_filter *filter; bool is_charged = true; struct sock *newsk; newsk = sk_prot_alloc(prot, priority, sk->sk_family); if (!newsk) goto out; sock_copy(newsk, sk); newsk->sk_prot_creator = prot; /* SANITY */ if (likely(newsk->sk_net_refcnt)) { get_net_track(sock_net(newsk), &newsk->ns_tracker, priority); sock_inuse_add(sock_net(newsk), 1); } else { /* Kernel sockets are not elevating the struct net refcount. * Instead, use a tracker to more easily detect if a layer * is not properly dismantling its kernel sockets at netns * destroy time. */ __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, false, priority); } sk_node_init(&newsk->sk_node); sock_lock_init(newsk); bh_lock_sock(newsk); newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; newsk->sk_backlog.len = 0; atomic_set(&newsk->sk_rmem_alloc, 0); /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */ refcount_set(&newsk->sk_wmem_alloc, 1); atomic_set(&newsk->sk_omem_alloc, 0); sk_init_common(newsk); newsk->sk_dst_cache = NULL; newsk->sk_dst_pending_confirm = 0; newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; newsk->sk_reserved_mem = 0; atomic_set(&newsk->sk_drops, 0); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; atomic_set(&newsk->sk_zckey, 0); sock_reset_flag(newsk, SOCK_DONE); /* sk->sk_memcg will be populated at accept() time */ newsk->sk_memcg = NULL; cgroup_sk_clone(&newsk->sk_cgrp_data); rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter != NULL) /* though it's an empty new sock, the charging may fail * if sysctl_optmem_max was changed between creation of * original socket and cloning */ is_charged = sk_filter_charge(newsk, filter); RCU_INIT_POINTER(newsk->sk_filter, filter); rcu_read_unlock(); if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { /* We need to make sure that we don't uncharge the new * socket if we couldn't charge it in the first place * as otherwise we uncharge the parent's filter. 
*/ if (!is_charged) RCU_INIT_POINTER(newsk->sk_filter, NULL); sk_free_unlock_clone(newsk); newsk = NULL; goto out; } RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); if (bpf_sk_storage_clone(sk, newsk)) { sk_free_unlock_clone(newsk); newsk = NULL; goto out; } /* Clear sk_user_data if parent had the pointer tagged * as not suitable for copying when cloning. */ if (sk_user_data_is_nocopy(newsk)) newsk->sk_user_data = NULL; newsk->sk_err = 0; newsk->sk_err_soft = 0; newsk->sk_priority = 0; newsk->sk_incoming_cpu = raw_smp_processor_id(); /* Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.rst for details) */ smp_wmb(); refcount_set(&newsk->sk_refcnt, 2); sk_set_socket(newsk, NULL); sk_tx_queue_clear(newsk); RCU_INIT_POINTER(newsk->sk_wq, NULL); if (newsk->sk_prot->sockets_allocated) sk_sockets_allocated_inc(newsk); if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP) net_enable_timestamp(); out: return newsk; } EXPORT_SYMBOL_GPL(sk_clone_lock); void sk_free_unlock_clone(struct sock *sk) { /* It is still raw copy of parent, so invalidate * destructor and make plain sk_free() */ sk->sk_destruct = NULL; bh_unlock_sock(sk); sk_free(sk); } EXPORT_SYMBOL_GPL(sk_free_unlock_clone); static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) { bool is_ipv6 = false; u32 max_size; #if IS_ENABLED(CONFIG_IPV6) is_ipv6 = (sk->sk_family == AF_INET6 && !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); #endif /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) : READ_ONCE(dst->dev->gso_ipv4_max_size); if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) max_size = GSO_LEGACY_MAX_SIZE; return max_size - (MAX_TCP_HEADER + 1); } void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { u32 max_segs = 1; sk->sk_route_caps = dst->dev->features; if (sk_is_tcp(sk)) sk->sk_route_caps |= NETIF_F_GSO; if (sk->sk_route_caps & NETIF_F_GSO) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; if (unlikely(sk->sk_gso_disabled)) sk->sk_route_caps &= ~NETIF_F_GSO_MASK; if (sk_can_gso(sk)) { if (dst->header_len && !xfrm_dst_offload_ok(dst)) { sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); } } sk->sk_gso_max_segs = max_segs; sk_dst_set(sk, dst); } EXPORT_SYMBOL_GPL(sk_setup_caps); /* * Simple resource managers for sockets. */ /* * Write buffer destructor automatically called from kfree_skb. */ void sock_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; unsigned int len = skb->truesize; bool free; if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { if (sock_flag(sk, SOCK_RCU_FREE) && sk->sk_write_space == sock_def_write_space) { rcu_read_lock(); free = refcount_sub_and_test(len, &sk->sk_wmem_alloc); sock_def_write_space_wfree(sk); rcu_read_unlock(); if (unlikely(free)) __sk_free(sk); return; } /* * Keep a reference on sk_wmem_alloc, this will be released * after sk_write_space() call */ WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); sk->sk_write_space(sk); len = 1; } /* * if sk_wmem_alloc reaches 0, we must finish what sk_free() * could not do because of in-flight packets */ if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) __sk_free(sk); } EXPORT_SYMBOL(sock_wfree); /* This variant of sock_wfree() is used by TCP, * since it sets SOCK_USE_WRITE_QUEUE. 
*/ void __sock_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) __sk_free(sk); } void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); #ifdef CONFIG_INET if (unlikely(!sk_fullsock(sk))) return skb_set_owner_edemux(skb, sk); #endif skb->sk = sk; skb->destructor = sock_wfree; skb_set_hash_from_sk(skb, sk); /* * We used to take a refcount on sk, but following operation * is enough to guarantee sk_free() won't free this sock until * all in-flight packets are completed */ refcount_add(skb->truesize, &sk->sk_wmem_alloc); } EXPORT_SYMBOL(skb_set_owner_w); static bool can_skb_orphan_partial(const struct sk_buff *skb) { /* Drivers depend on in-order delivery for crypto offload, * partial orphan breaks out-of-order-OK logic. */ if (skb_is_decrypted(skb)) return false; return (skb->destructor == sock_wfree || (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); } /* This helper is used by netem, as it can hold packets in its * delay queue. We want to allow the owner socket to send more * packets, as if they were already TX completed by a typical driver. * But we also want to keep skb->sk set because some packet schedulers * rely on it (sch_fq for example). */ void skb_orphan_partial(struct sk_buff *skb) { if (skb_is_tcp_pure_ack(skb)) return; if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk)) return; skb_orphan(skb); } EXPORT_SYMBOL(skb_orphan_partial); /* * Read buffer destructor automatically called from kfree_skb. */ void sock_rfree(struct sk_buff *skb) { struct sock *sk = skb->sk; unsigned int len = skb->truesize; atomic_sub(len, &sk->sk_rmem_alloc); sk_mem_uncharge(sk, len); } EXPORT_SYMBOL(sock_rfree); /* * Buffer destructor for skbs that are not used directly in read or write * path, e.g. for error handler skbs. Automatically called from kfree_skb. */ void sock_efree(struct sk_buff *skb) { sock_put(skb->sk); } EXPORT_SYMBOL(sock_efree); /* Buffer destructor for prefetch/receive path where reference count may * not be held, e.g. for listen sockets. */ #ifdef CONFIG_INET void sock_pfree(struct sk_buff *skb) { struct sock *sk = skb->sk; if (!sk_is_refcounted(sk)) return; if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) { inet_reqsk(sk)->rsk_listener = NULL; reqsk_free(inet_reqsk(sk)); return; } sock_gen_put(sk); } EXPORT_SYMBOL(sock_pfree); #endif /* CONFIG_INET */ kuid_t sock_i_uid(struct sock *sk) { kuid_t uid; read_lock_bh(&sk->sk_callback_lock); uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; read_unlock_bh(&sk->sk_callback_lock); return uid; } EXPORT_SYMBOL(sock_i_uid); unsigned long __sock_i_ino(struct sock *sk) { unsigned long ino; read_lock(&sk->sk_callback_lock); ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; read_unlock(&sk->sk_callback_lock); return ino; } EXPORT_SYMBOL(__sock_i_ino); unsigned long sock_i_ino(struct sock *sk) { unsigned long ino; local_bh_disable(); ino = __sock_i_ino(sk); local_bh_enable(); return ino; } EXPORT_SYMBOL(sock_i_ino); /* * Allocate a skb from the socket's send buffer. 
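 *
 * The skb is charged to sk_wmem_alloc via skb_set_owner_w(); unless @force
 * is set, the allocation is refused once the outstanding write allocations
 * already reach sk_sndbuf.  Hypothetical in-kernel usage sketch (error
 * handling elided):
 *
 *	struct sk_buff *skb = sock_wmalloc(sk, size, 0, GFP_KERNEL);
 *
 *	if (skb)
 *		...		(queue it; sock_wfree() uncharges on free)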
*/ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority) { if (force || refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { struct sk_buff *skb = alloc_skb(size, priority); if (skb) { skb_set_owner_w(skb, sk); return skb; } } return NULL; } EXPORT_SYMBOL(sock_wmalloc); static void sock_ofree(struct sk_buff *skb) { struct sock *sk = skb->sk; atomic_sub(skb->truesize, &sk->sk_omem_alloc); } struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, gfp_t priority) { struct sk_buff *skb; /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) return NULL; skb = alloc_skb(size, priority); if (!skb) return NULL; atomic_add(skb->truesize, &sk->sk_omem_alloc); skb->sk = sk; skb->destructor = sock_ofree; return skb; } /* * Allocate a memory block from the socket's option memory buffer. */ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) { int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); if ((unsigned int)size <= optmem_max && atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { void *mem; /* First do the add, to avoid the race if kmalloc * might sleep. */ atomic_add(size, &sk->sk_omem_alloc); mem = kmalloc(size, priority); if (mem) return mem; atomic_sub(size, &sk->sk_omem_alloc); } return NULL; } EXPORT_SYMBOL(sock_kmalloc); /* Free an option memory block. Note, we actually want the inline * here as this allows gcc to detect the nullify and fold away the * condition entirely. */ static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, const bool nullify) { if (WARN_ON_ONCE(!mem)) return; if (nullify) kfree_sensitive(mem); else kfree(mem); atomic_sub(size, &sk->sk_omem_alloc); } void sock_kfree_s(struct sock *sk, void *mem, int size) { __sock_kfree_s(sk, mem, size, false); } EXPORT_SYMBOL(sock_kfree_s); void sock_kzfree_s(struct sock *sk, void *mem, int size) { __sock_kfree_s(sk, mem, size, true); } EXPORT_SYMBOL(sock_kzfree_s); /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. I think, these locks should be removed for datagram sockets. 
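 *
 * The helper below sleeps (interruptibly) until sk_wmem_alloc drops below
 * sk_sndbuf, the socket is shut down for sending, an error is pending, or
 * the timeout expires, and returns the remaining timeout; it is only used
 * by sock_alloc_send_pskb().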
*/ static long sock_wait_for_wmem(struct sock *sk, long timeo) { DEFINE_WAIT(wait); sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); for (;;) { if (!timeo) break; if (signal_pending(current)) break; set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) break; if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) break; if (READ_ONCE(sk->sk_err)) break; timeo = schedule_timeout(timeo); } finish_wait(sk_sleep(sk), &wait); return timeo; } /* * Generic send/receive buffer handlers */ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, int *errcode, int max_page_order) { struct sk_buff *skb; long timeo; int err; timeo = sock_sndtimeo(sk, noblock); for (;;) { err = sock_error(sk); if (err != 0) goto failure; err = -EPIPE; if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) goto failure; if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) break; sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = -EAGAIN; if (!timeo) goto failure; if (signal_pending(current)) goto interrupted; timeo = sock_wait_for_wmem(sk, timeo); } skb = alloc_skb_with_frags(header_len, data_len, max_page_order, errcode, sk->sk_allocation); if (skb) skb_set_owner_w(skb, sk); return skb; interrupted: err = sock_intr_errno(timeo); failure: *errcode = err; return NULL; } EXPORT_SYMBOL(sock_alloc_send_pskb); int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, struct sockcm_cookie *sockc) { u32 tsflags; BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31)); switch (cmsg->cmsg_type) { case SO_MARK: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) return -EINVAL; sockc->mark = *(u32 *)CMSG_DATA(cmsg); break; case SO_TIMESTAMPING_OLD: case SO_TIMESTAMPING_NEW: if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) return -EINVAL; tsflags = *(u32 *)CMSG_DATA(cmsg); if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) return -EINVAL; sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; sockc->tsflags |= tsflags; break; case SCM_TXTIME: if (!sock_flag(sk, SOCK_TXTIME)) return -EINVAL; if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) return -EINVAL; sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); break; case SCM_TS_OPT_ID: if (sk_is_tcp(sk)) return -EINVAL; tsflags = READ_ONCE(sk->sk_tsflags); if (!(tsflags & SOF_TIMESTAMPING_OPT_ID)) return -EINVAL; if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) return -EINVAL; sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg); sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID; break; /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. 
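 *
 * For the SOL_SOCKET types handled above, an illustrative userspace sketch
 * of building a per-packet mark as ancillary data (example value only;
 * needs CAP_NET_RAW or CAP_NET_ADMIN, as checked above):
 *
 *	char buf[CMSG_SPACE(sizeof(uint32_t))] = {};
 *	struct msghdr msg = {
 *		.msg_control = buf, .msg_controllen = sizeof(buf),
 *	};
 *	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
 *
 *	cm->cmsg_level = SOL_SOCKET;
 *	cm->cmsg_type = SO_MARK;
 *	cm->cmsg_len = CMSG_LEN(sizeof(uint32_t));
 *	*(uint32_t *)CMSG_DATA(cm) = 42;
 *
 * and then handed to sendmsg() together with the payload iovec.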
*/ case SCM_RIGHTS: case SCM_CREDENTIALS: break; default: return -EINVAL; } return 0; } EXPORT_SYMBOL(__sock_cmsg_send); int sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct sockcm_cookie *sockc) { struct cmsghdr *cmsg; int ret; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_SOCKET) continue; ret = __sock_cmsg_send(sk, cmsg, sockc); if (ret) return ret; } return 0; } EXPORT_SYMBOL(sock_cmsg_send); static void sk_enter_memory_pressure(struct sock *sk) { if (!sk->sk_prot->enter_memory_pressure) return; sk->sk_prot->enter_memory_pressure(sk); } static void sk_leave_memory_pressure(struct sock *sk) { if (sk->sk_prot->leave_memory_pressure) { INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure, tcp_leave_memory_pressure, sk); } else { unsigned long *memory_pressure = sk->sk_prot->memory_pressure; if (memory_pressure && READ_ONCE(*memory_pressure)) WRITE_ONCE(*memory_pressure, 0); } } DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); /** * skb_page_frag_refill - check that a page_frag contains enough room * @sz: minimum size of the fragment we want to get * @pfrag: pointer to page_frag * @gfp: priority for memory allocation * * Note: While this allocator tries to use high order pages, there is * no guarantee that allocations succeed. Therefore, @sz MUST be * less or equal than PAGE_SIZE. */ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) { if (pfrag->page) { if (page_ref_count(pfrag->page) == 1) { pfrag->offset = 0; return true; } if (pfrag->offset + sz <= pfrag->size) return true; put_page(pfrag->page); } pfrag->offset = 0; if (SKB_FRAG_PAGE_ORDER && !static_branch_unlikely(&net_high_order_alloc_disable_key)) { /* Avoid direct reclaim but allow kswapd to wake */ pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY, SKB_FRAG_PAGE_ORDER); if (likely(pfrag->page)) { pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; return true; } } pfrag->page = alloc_page(gfp); if (likely(pfrag->page)) { pfrag->size = PAGE_SIZE; return true; } return false; } EXPORT_SYMBOL(skb_page_frag_refill); bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) { if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) return true; sk_enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); return false; } EXPORT_SYMBOL(sk_page_frag_refill); void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { DEFINE_WAIT(wait); for (;;) { prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, TASK_UNINTERRUPTIBLE); spin_unlock_bh(&sk->sk_lock.slock); schedule(); spin_lock_bh(&sk->sk_lock.slock); if (!sock_owned_by_user(sk)) break; } finish_wait(&sk->sk_lock.wq, &wait); } void __release_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { struct sk_buff *skb, *next; while ((skb = sk->sk_backlog.head) != NULL) { sk->sk_backlog.head = sk->sk_backlog.tail = NULL; spin_unlock_bh(&sk->sk_lock.slock); do { next = skb->next; prefetch(next); DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb)); skb_mark_not_on_list(skb); sk_backlog_rcv(sk, skb); cond_resched(); skb = next; } while (skb != NULL); spin_lock_bh(&sk->sk_lock.slock); } /* * Doing the zeroing here guarantee we can not loop forever * while a wild producer attempts to flood us. 
*/ sk->sk_backlog.len = 0; } void __sk_flush_backlog(struct sock *sk) { spin_lock_bh(&sk->sk_lock.slock); __release_sock(sk); if (sk->sk_prot->release_cb) INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, tcp_release_cb, sk); spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL_GPL(__sk_flush_backlog); /** * sk_wait_data - wait for data to arrive at sk_receive_queue * @sk: sock to wait on * @timeo: for how long * @skb: last skb seen on sk_receive_queue * * Now socket state including sk->sk_err is changed only under lock, * hence we may omit checks after joining wait queue. * We check receive queue before schedule() only as optimization; * it is very likely that release_sock() added new data. */ int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) { DEFINE_WAIT_FUNC(wait, woken_wake_function); int rc; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); return rc; } EXPORT_SYMBOL(sk_wait_data); /** * __sk_mem_raise_allocated - increase memory_allocated * @sk: socket * @size: memory size to allocate * @amt: pages to allocate * @kind: allocation type * * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc. * * Unlike the globally shared limits among the sockets under same protocol, * consuming the budget of a memcg won't have direct effect on other ones. * So be optimistic about memcg's tolerance, and leave the callers to decide * whether or not to raise allocated through sk_under_memory_pressure() or * its variants. */ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL; struct proto *prot = sk->sk_prot; bool charged = false; long allocated; sk_memory_allocated_add(sk, amt); allocated = sk_memory_allocated(sk); if (memcg) { if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge())) goto suppress_allocation; charged = true; } /* Under limit. */ if (allocated <= sk_prot_mem_limits(sk, 0)) { sk_leave_memory_pressure(sk); return 1; } /* Under pressure. */ if (allocated > sk_prot_mem_limits(sk, 1)) sk_enter_memory_pressure(sk); /* Over hard limit. */ if (allocated > sk_prot_mem_limits(sk, 2)) goto suppress_allocation; /* Guarantee minimum buffer size under pressure (either global * or memcg) to make sure features described in RFC 7323 (TCP * Extensions for High Performance) work properly. * * This rule does NOT stand when exceeds global or memcg's hard * limit, or else a DoS attack can be taken place by spawning * lots of sockets whose usage are under minimum buffer size. */ if (kind == SK_MEM_RECV) { if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) return 1; } else { /* SK_MEM_SEND */ int wmem0 = sk_get_wmem0(sk, prot); if (sk->sk_type == SOCK_STREAM) { if (sk->sk_wmem_queued < wmem0) return 1; } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { return 1; } } if (sk_has_memory_pressure(sk)) { u64 alloc; /* The following 'average' heuristic is within the * scope of global accounting, so it only makes * sense for global memory pressure. */ if (!sk_under_global_memory_pressure(sk)) return 1; /* Try to be fair among all the sockets under global * pressure by allowing the ones that below average * usage to raise. 
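 *
 * Concretely, the check below allows raising while
 *
 *	nr_sockets * pages(wmem_queued + rmem_alloc + forward_alloc)
 *		< sk_prot_mem_limits(sk, 2)
 *
 * i.e. while this socket consumes less than an equal share of the hard
 * limit.  With made-up numbers: a hard limit of 4096 pages and 1024
 * sockets lets a socket using fewer than 4 pages keep growing.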
*/ alloc = sk_sockets_allocated_read_positive(sk); if (sk_prot_mem_limits(sk, 2) > alloc * sk_mem_pages(sk->sk_wmem_queued + atomic_read(&sk->sk_rmem_alloc) + sk->sk_forward_alloc)) return 1; } suppress_allocation: if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { sk_stream_moderate_sndbuf(sk); /* Fail only if socket is _under_ its sndbuf. * In this case we cannot block, so that we have to fail. */ if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { /* Force charge with __GFP_NOFAIL */ if (memcg && !charged) { mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge() | __GFP_NOFAIL); } return 1; } } if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) trace_sock_exceed_buf_limit(sk, prot, allocated, kind); sk_memory_allocated_sub(sk, amt); if (charged) mem_cgroup_uncharge_skmem(memcg, amt); return 0; } /** * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated * @sk: socket * @size: memory size to allocate * @kind: allocation type * * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means * rmem allocation. This function assumes that protocols which have * memory_pressure use sk_wmem_queued as write buffer accounting. */ int __sk_mem_schedule(struct sock *sk, int size, int kind) { int ret, amt = sk_mem_pages(size); sk_forward_alloc_add(sk, amt << PAGE_SHIFT); ret = __sk_mem_raise_allocated(sk, size, amt, kind); if (!ret) sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT)); return ret; } EXPORT_SYMBOL(__sk_mem_schedule); /** * __sk_mem_reduce_allocated - reclaim memory_allocated * @sk: socket * @amount: number of quanta * * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc */ void __sk_mem_reduce_allocated(struct sock *sk, int amount) { sk_memory_allocated_sub(sk, amount); if (mem_cgroup_sockets_enabled && sk->sk_memcg) mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); if (sk_under_global_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) sk_leave_memory_pressure(sk); } /** * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated * @sk: socket * @amount: number of bytes (rounded down to a PAGE_SIZE multiple) */ void __sk_mem_reclaim(struct sock *sk, int amount) { amount >>= PAGE_SHIFT; sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT)); __sk_mem_reduce_allocated(sk, amount); } EXPORT_SYMBOL(__sk_mem_reclaim); int sk_set_peek_off(struct sock *sk, int val) { WRITE_ONCE(sk->sk_peek_off, val); return 0; } EXPORT_SYMBOL_GPL(sk_set_peek_off); /* * Set of default routines for initialising struct proto_ops when * the protocol does not support a particular function. In certain * cases where it makes no sense for a protocol to have a "do nothing" * function, some default processing is provided. 
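 *
 * A hypothetical protocol supporting none of these operations could wire
 * the stubs straight into its ops table (sketch only; "example_proto_ops"
 * is a made-up name):
 *
 *	static const struct proto_ops example_proto_ops = {
 *		.family		= PF_UNSPEC,
 *		.owner		= THIS_MODULE,
 *		.bind		= sock_no_bind,
 *		.connect	= sock_no_connect,
 *		.accept		= sock_no_accept,
 *		.mmap		= sock_no_mmap,
 *	};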
*/ int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_bind); int sock_no_connect(struct socket *sock, struct sockaddr *saddr, int len, int flags) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_connect); int sock_no_socketpair(struct socket *sock1, struct socket *sock2) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_socketpair); int sock_no_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_accept); int sock_no_getname(struct socket *sock, struct sockaddr *saddr, int peer) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_getname); int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_ioctl); int sock_no_listen(struct socket *sock, int backlog) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_listen); int sock_no_shutdown(struct socket *sock, int how) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_shutdown); int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_sendmsg); int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_sendmsg_locked); int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_recvmsg); int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { /* Mirror missing mmap method error code */ return -ENODEV; } EXPORT_SYMBOL(sock_no_mmap); /* * When a file is received (via SCM_RIGHTS, etc), we must bump the * various sock-based usage counts. */ void __receive_sock(struct file *file) { struct socket *sock; sock = sock_from_file(file); if (sock) { sock_update_netprioidx(&sock->sk->sk_cgrp_data); sock_update_classid(&sock->sk->sk_cgrp_data); } } /* * Default Socket Callbacks */ static void sock_def_wakeup(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_all(&wq->wait); rcu_read_unlock(); } static void sock_def_error_report(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, EPOLLERR); sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR); rcu_read_unlock(); } void sock_def_readable(struct sock *sk) { struct socket_wq *wq; trace_sk_data_ready(sk); rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | EPOLLRDNORM | EPOLLRDBAND); sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } static void sock_def_write_space(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); /* Do not wake up a writer until he can make "significant" * progress. --DaveM */ if (sock_writeable(sk)) { wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } /* An optimised version of sock_def_write_space(), should only be called * for SOCK_RCU_FREE sockets under RCU read section and after putting * ->sk_wmem_alloc. */ static void sock_def_write_space_wfree(struct sock *sk) { /* Do not wake up a writer until he can make "significant" * progress. 
--DaveM */ if (sock_writeable(sk)) { struct socket_wq *wq = rcu_dereference(sk->sk_wq); /* rely on refcount_sub from sock_wfree() */ smp_mb__after_atomic(); if (wq && waitqueue_active(&wq->wait)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } } static void sock_def_destruct(struct sock *sk) { } void sk_send_sigurg(struct sock *sk) { if (sk->sk_socket && sk->sk_socket->file) if (send_sigurg(sk->sk_socket->file)) sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); } EXPORT_SYMBOL(sk_send_sigurg); void sk_reset_timer(struct sock *sk, struct timer_list* timer, unsigned long expires) { if (!mod_timer(timer, expires)) sock_hold(sk); } EXPORT_SYMBOL(sk_reset_timer); void sk_stop_timer(struct sock *sk, struct timer_list* timer) { if (del_timer(timer)) __sock_put(sk); } EXPORT_SYMBOL(sk_stop_timer); void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) { if (del_timer_sync(timer)) __sock_put(sk); } EXPORT_SYMBOL(sk_stop_timer_sync); void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) { sk_init_common(sk); sk->sk_send_head = NULL; timer_setup(&sk->sk_timer, NULL, 0); sk->sk_allocation = GFP_KERNEL; sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); sk->sk_state = TCP_CLOSE; sk->sk_use_task_frag = true; sk_set_socket(sk, sock); sock_set_flag(sk, SOCK_ZAPPED); if (sock) { sk->sk_type = sock->type; RCU_INIT_POINTER(sk->sk_wq, &sock->wq); sock->sk = sk; } else { RCU_INIT_POINTER(sk->sk_wq, NULL); } sk->sk_uid = uid; sk->sk_state_change = sock_def_wakeup; sk->sk_data_ready = sock_def_readable; sk->sk_write_space = sock_def_write_space; sk->sk_error_report = sock_def_error_report; sk->sk_destruct = sock_def_destruct; sk->sk_frag.page = NULL; sk->sk_frag.offset = 0; sk->sk_peek_off = -1; sk->sk_peer_pid = NULL; sk->sk_peer_cred = NULL; spin_lock_init(&sk->sk_peer_lock); sk->sk_write_pending = 0; sk->sk_rcvlowat = 1; sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_stamp = SK_DEFAULT_STAMP; #if BITS_PER_LONG==32 seqlock_init(&sk->sk_stamp_seq); #endif atomic_set(&sk->sk_zckey, 0); #ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = 0; sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read); #endif sk->sk_max_pacing_rate = ~0UL; sk->sk_pacing_rate = ~0UL; WRITE_ONCE(sk->sk_pacing_shift, 10); sk->sk_incoming_cpu = -1; sk_rx_queue_clear(sk); /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.rst for details) */ smp_wmb(); refcount_set(&sk->sk_refcnt, 1); atomic_set(&sk->sk_drops, 0); } EXPORT_SYMBOL(sock_init_data_uid); void sock_init_data(struct socket *sock, struct sock *sk) { kuid_t uid = sock ? SOCK_INODE(sock)->i_uid : make_kuid(sock_net(sk)->user_ns, 0); sock_init_data_uid(sock, sk, uid); } EXPORT_SYMBOL(sock_init_data); void lock_sock_nested(struct sock *sk, int subclass) { /* The sk_lock has mutex_lock() semantics here. 
*/ mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); might_sleep(); spin_lock_bh(&sk->sk_lock.slock); if (sock_owned_by_user_nocheck(sk)) __lock_sock(sk); sk->sk_lock.owned = 1; spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL(lock_sock_nested); void release_sock(struct sock *sk) { spin_lock_bh(&sk->sk_lock.slock); if (sk->sk_backlog.tail) __release_sock(sk); if (sk->sk_prot->release_cb) INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, tcp_release_cb, sk); sock_release_ownership(sk); if (waitqueue_active(&sk->sk_lock.wq)) wake_up(&sk->sk_lock.wq); spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL(release_sock); bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) { might_sleep(); spin_lock_bh(&sk->sk_lock.slock); if (!sock_owned_by_user_nocheck(sk)) { /* * Fast path return with bottom halves disabled and * sock::sk_lock.slock held. * * The 'mutex' is not contended and holding * sock::sk_lock.slock prevents all other lockers to * proceed so the corresponding unlock_sock_fast() can * avoid the slow path of release_sock() completely and * just release slock. * * From a semantical POV this is equivalent to 'acquiring' * the 'mutex', hence the corresponding lockdep * mutex_release() has to happen in the fast path of * unlock_sock_fast(). */ return false; } __lock_sock(sk); sk->sk_lock.owned = 1; __acquire(&sk->sk_lock.slock); spin_unlock_bh(&sk->sk_lock.slock); return true; } EXPORT_SYMBOL(__lock_sock_fast); int sock_gettstamp(struct socket *sock, void __user *userstamp, bool timeval, bool time32) { struct sock *sk = sock->sk; struct timespec64 ts; sock_enable_timestamp(sk, SOCK_TIMESTAMP); ts = ktime_to_timespec64(sock_read_timestamp(sk)); if (ts.tv_sec == -1) return -ENOENT; if (ts.tv_sec == 0) { ktime_t kt = ktime_get_real(); sock_write_timestamp(sk, kt); ts = ktime_to_timespec64(kt); } if (timeval) ts.tv_nsec /= 1000; #ifdef CONFIG_COMPAT_32BIT_TIME if (time32) return put_old_timespec32(&ts, userstamp); #endif #ifdef CONFIG_SPARC64 /* beware of padding in sparc64 timeval */ if (timeval && !in_compat_syscall()) { struct __kernel_old_timeval __user tv = { .tv_sec = ts.tv_sec, .tv_usec = ts.tv_nsec, }; if (copy_to_user(userstamp, &tv, sizeof(tv))) return -EFAULT; return 0; } #endif return put_timespec64(&ts, userstamp); } EXPORT_SYMBOL(sock_gettstamp); void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) { if (!sock_flag(sk, flag)) { unsigned long previous_flags = sk->sk_flags; sock_set_flag(sk, flag); /* * we just set one of the two flags which require net * time stamping, but time stamping might have been on * already because of the other one */ if (sock_needs_netstamp(sk) && !(previous_flags & SK_FLAGS_TIMESTAMP)) net_enable_timestamp(); } } int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type) { struct sock_exterr_skb *serr; struct sk_buff *skb; int copied, err; err = -EAGAIN; skb = sock_dequeue_err_skb(sk); if (skb == NULL) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; sock_recv_timestamp(msg, sk, skb); serr = SKB_EXT_ERR(skb); put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); msg->msg_flags |= MSG_ERRQUEUE; err = copied; out_free_skb: kfree_skb(skb); out: return err; } EXPORT_SYMBOL(sock_recv_errqueue); /* * Get a socket option on an socket. * * FIX: POSIX 1003.1g is very ambiguous here. It states that * asynchronous errors should be reported by getsockopt. 
We assume * this means if you specify SO_ERROR (otherwise what is the point of it). */ int sock_common_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_common_getsockopt); int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; int addr_len = 0; int err; err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len); if (err >= 0) msg->msg_namelen = addr_len; return err; } EXPORT_SYMBOL(sock_common_recvmsg); /* * Set socket options on an inet socket. */ int sock_common_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_common_setsockopt); void sk_common_release(struct sock *sk) { if (sk->sk_prot->destroy) sk->sk_prot->destroy(sk); /* * Observation: when sk_common_release is called, processes have * no access to socket. But net still has. * Step one, detach it from networking: * * A. Remove from hash tables. */ sk->sk_prot->unhash(sk); /* * In this point socket cannot receive new packets, but it is possible * that some packets are in flight because some CPU runs receiver and * did hash table lookup before we unhashed socket. They will achieve * receive queue and will be purged by socket destructor. * * Also we still have packets pending on receive queue and probably, * our own packets waiting in device queues. sock_destroy will drain * receive queue, but transmitted packets will delay socket destruction * until the last reference will be released. */ sock_orphan(sk); xfrm_sk_free_policy(sk); sock_put(sk); } EXPORT_SYMBOL(sk_common_release); void sk_get_meminfo(const struct sock *sk, u32 *mem) { memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk); mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); } #ifdef CONFIG_PROC_FS static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); int sock_prot_inuse_get(struct net *net, struct proto *prot) { int cpu, idx = prot->inuse_idx; int res = 0; for_each_possible_cpu(cpu) res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; return res >= 0 ? 
res : 0; } EXPORT_SYMBOL_GPL(sock_prot_inuse_get); int sock_inuse_get(struct net *net) { int cpu, res = 0; for_each_possible_cpu(cpu) res += per_cpu_ptr(net->core.prot_inuse, cpu)->all; return res; } EXPORT_SYMBOL_GPL(sock_inuse_get); static int __net_init sock_inuse_init_net(struct net *net) { net->core.prot_inuse = alloc_percpu(struct prot_inuse); if (net->core.prot_inuse == NULL) return -ENOMEM; return 0; } static void __net_exit sock_inuse_exit_net(struct net *net) { free_percpu(net->core.prot_inuse); } static struct pernet_operations net_inuse_ops = { .init = sock_inuse_init_net, .exit = sock_inuse_exit_net, }; static __init int net_inuse_init(void) { if (register_pernet_subsys(&net_inuse_ops)) panic("Cannot initialize net inuse counters"); return 0; } core_initcall(net_inuse_init); static int assign_proto_idx(struct proto *prot) { prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { pr_err("PROTO_INUSE_NR exhausted\n"); return -ENOSPC; } set_bit(prot->inuse_idx, proto_inuse_idx); return 0; } static void release_proto_idx(struct proto *prot) { if (prot->inuse_idx != PROTO_INUSE_NR - 1) clear_bit(prot->inuse_idx, proto_inuse_idx); } #else static inline int assign_proto_idx(struct proto *prot) { return 0; } static inline void release_proto_idx(struct proto *prot) { } #endif static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) { if (!twsk_prot) return; kfree(twsk_prot->twsk_slab_name); twsk_prot->twsk_slab_name = NULL; kmem_cache_destroy(twsk_prot->twsk_slab); twsk_prot->twsk_slab = NULL; } static int tw_prot_init(const struct proto *prot) { struct timewait_sock_ops *twsk_prot = prot->twsk_prot; if (!twsk_prot) return 0; twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); if (!twsk_prot->twsk_slab_name) return -ENOMEM; twsk_prot->twsk_slab = kmem_cache_create(twsk_prot->twsk_slab_name, twsk_prot->twsk_obj_size, 0, SLAB_ACCOUNT | prot->slab_flags, NULL); if (!twsk_prot->twsk_slab) { pr_crit("%s: Can't create timewait sock SLAB cache!\n", prot->name); return -ENOMEM; } return 0; } static void req_prot_cleanup(struct request_sock_ops *rsk_prot) { if (!rsk_prot) return; kfree(rsk_prot->slab_name); rsk_prot->slab_name = NULL; kmem_cache_destroy(rsk_prot->slab); rsk_prot->slab = NULL; } static int req_prot_init(const struct proto *prot) { struct request_sock_ops *rsk_prot = prot->rsk_prot; if (!rsk_prot) return 0; rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name); if (!rsk_prot->slab_name) return -ENOMEM; rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, rsk_prot->obj_size, 0, SLAB_ACCOUNT | prot->slab_flags, NULL); if (!rsk_prot->slab) { pr_crit("%s: Can't create request sock SLAB cache!\n", prot->name); return -ENOMEM; } return 0; } int proto_register(struct proto *prot, int alloc_slab) { int ret = -ENOBUFS; if (prot->memory_allocated && !prot->sysctl_mem) { pr_err("%s: missing sysctl_mem\n", prot->name); return -EINVAL; } if (prot->memory_allocated && !prot->per_cpu_fw_alloc) { pr_err("%s: missing per_cpu_fw_alloc\n", prot->name); return -EINVAL; } if (alloc_slab) { prot->slab = kmem_cache_create_usercopy(prot->name, prot->obj_size, 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | prot->slab_flags, prot->useroffset, prot->usersize, NULL); if (prot->slab == NULL) { pr_crit("%s: Can't create sock SLAB cache!\n", prot->name); goto out; } if (req_prot_init(prot)) goto out_free_request_sock_slab; if (tw_prot_init(prot)) goto out_free_timewait_sock_slab; } 
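/* Slab caches (and any request/timewait caches) are set up at this point;
 * proto_register() now reserves a slot in the per-protocol "inuse" bitmap
 * (when CONFIG_PROC_FS is enabled) and publishes the proto on proto_list
 * under proto_list_mutex, which is what /proc/net/protocols iterates. */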
mutex_lock(&proto_list_mutex); ret = assign_proto_idx(prot); if (ret) { mutex_unlock(&proto_list_mutex); goto out_free_timewait_sock_slab; } list_add(&prot->node, &proto_list); mutex_unlock(&proto_list_mutex); return ret; out_free_timewait_sock_slab: if (alloc_slab) tw_prot_cleanup(prot->twsk_prot); out_free_request_sock_slab: if (alloc_slab) { req_prot_cleanup(prot->rsk_prot); kmem_cache_destroy(prot->slab); prot->slab = NULL; } out: return ret; } EXPORT_SYMBOL(proto_register); void proto_unregister(struct proto *prot) { mutex_lock(&proto_list_mutex); release_proto_idx(prot); list_del(&prot->node); mutex_unlock(&proto_list_mutex); kmem_cache_destroy(prot->slab); prot->slab = NULL; req_prot_cleanup(prot->rsk_prot); tw_prot_cleanup(prot->twsk_prot); } EXPORT_SYMBOL(proto_unregister); int sock_load_diag_module(int family, int protocol) { if (!protocol) { if (!sock_is_registered(family)) return -ENOENT; return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, NETLINK_SOCK_DIAG, family); } #ifdef CONFIG_INET if (family == AF_INET && protocol != IPPROTO_RAW && protocol < MAX_INET_PROTOS && !rcu_access_pointer(inet_protos[protocol])) return -ENOENT; #endif return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, NETLINK_SOCK_DIAG, family, protocol); } EXPORT_SYMBOL(sock_load_diag_module); #ifdef CONFIG_PROC_FS static void *proto_seq_start(struct seq_file *seq, loff_t *pos) __acquires(proto_list_mutex) { mutex_lock(&proto_list_mutex); return seq_list_start_head(&proto_list, *pos); } static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_list_next(v, &proto_list, pos); } static void proto_seq_stop(struct seq_file *seq, void *v) __releases(proto_list_mutex) { mutex_unlock(&proto_list_mutex); } static char proto_method_implemented(const void *method) { return method == NULL ? 'n' : 'y'; } static long sock_prot_memory_allocated(struct proto *proto) { return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; } static const char *sock_prot_memory_pressure(struct proto *proto) { return proto->memory_pressure != NULL ? proto_memory_pressure(proto) ? "yes" : "no" : "NI"; } static void proto_seq_printf(struct seq_file *seq, struct proto *proto) { seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", proto->name, proto->obj_size, sock_prot_inuse_get(seq_file_net(seq), proto), sock_prot_memory_allocated(proto), sock_prot_memory_pressure(proto), proto->max_header, proto->slab == NULL ? 
"no" : "yes", module_name(proto->owner), proto_method_implemented(proto->close), proto_method_implemented(proto->connect), proto_method_implemented(proto->disconnect), proto_method_implemented(proto->accept), proto_method_implemented(proto->ioctl), proto_method_implemented(proto->init), proto_method_implemented(proto->destroy), proto_method_implemented(proto->shutdown), proto_method_implemented(proto->setsockopt), proto_method_implemented(proto->getsockopt), proto_method_implemented(proto->sendmsg), proto_method_implemented(proto->recvmsg), proto_method_implemented(proto->bind), proto_method_implemented(proto->backlog_rcv), proto_method_implemented(proto->hash), proto_method_implemented(proto->unhash), proto_method_implemented(proto->get_port), proto_method_implemented(proto->enter_memory_pressure)); } static int proto_seq_show(struct seq_file *seq, void *v) { if (v == &proto_list) seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", "protocol", "size", "sockets", "memory", "press", "maxhdr", "slab", "module", "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n"); else proto_seq_printf(seq, list_entry(v, struct proto, node)); return 0; } static const struct seq_operations proto_seq_ops = { .start = proto_seq_start, .next = proto_seq_next, .stop = proto_seq_stop, .show = proto_seq_show, }; static __net_init int proto_init_net(struct net *net) { if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, sizeof(struct seq_net_private))) return -ENOMEM; return 0; } static __net_exit void proto_exit_net(struct net *net) { remove_proc_entry("protocols", net->proc_net); } static __net_initdata struct pernet_operations proto_net_ops = { .init = proto_init_net, .exit = proto_exit_net, }; static int __init proto_init(void) { return register_pernet_subsys(&proto_net_ops); } subsys_initcall(proto_init); #endif /* PROC_FS */ #ifdef CONFIG_NET_RX_BUSY_POLL bool sk_busy_loop_end(void *p, unsigned long start_time) { struct sock *sk = p; if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) return true; if (sk_is_udp(sk) && !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)) return true; return sk_busy_loop_timeout(sk, start_time); } EXPORT_SYMBOL(sk_busy_loop_end); #endif /* CONFIG_NET_RX_BUSY_POLL */ int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) { if (!sk->sk_prot->bind_add) return -EOPNOTSUPP; return sk->sk_prot->bind_add(sk, addr, addr_len); } EXPORT_SYMBOL(sock_bind_add); /* Copy 'size' bytes from userspace and return `size` back to userspace */ int sock_ioctl_inout(struct sock *sk, unsigned int cmd, void __user *arg, void *karg, size_t size) { int ret; if (copy_from_user(karg, arg, size)) return -EFAULT; ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg); if (ret) return ret; if (copy_to_user(arg, karg, size)) return -EFAULT; return 0; } EXPORT_SYMBOL(sock_ioctl_inout); /* This is the most common ioctl prep function, where the result (4 bytes) is * copied back to userspace if the ioctl() returns successfully. No input is * copied from userspace as input argument. */ static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg) { int ret, karg = 0; ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg); if (ret) return ret; return put_user(karg, (int __user *)arg); } /* A wrapper around sock ioctls, which copies the data from userspace * (depending on the protocol/ioctl), and copies back the result to userspace. 
* The main motivation for this function is to pass kernel memory to the * protocol ioctl callbacks, instead of userspace memory. */ int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { int rc = 1; if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET) rc = ipmr_sk_ioctl(sk, cmd, arg); else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6) rc = ip6mr_sk_ioctl(sk, cmd, arg); else if (sk_is_phonet(sk)) rc = phonet_sk_ioctl(sk, cmd, arg); /* If ioctl was processed, returns its value */ if (rc <= 0) return rc; /* Otherwise call the default handler */ return sock_ioctl_out(sk, cmd, arg); } EXPORT_SYMBOL(sk_ioctl); static int __init sock_struct_check(void) { CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo); 
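/* Each assert above and below pins a field to the cacheline group
 * (sock_write_rx, sock_read_rx, sock_read_rxtx, sock_write_rxtx,
 * sock_write_tx, sock_read_tx) declared for struct sock in
 * include/net/sock.h, so reordering the structure fails the build
 * instead of silently regressing cache locality. */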
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag); return 0; } core_initcall(sock_struct_check);
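/*
 * A minimal userspace sketch (not part of sock.c itself): the counters
 * filled in by sk_get_meminfo() above are exported to applications via
 * getsockopt(SOL_SOCKET, SO_MEMINFO), so a small program can read them
 * back. SK_MEMINFO_* come from <linux/sock_diag.h>; the fallback value
 * 55 for SO_MEMINFO is the asm-generic one and is an assumption here,
 * since some architectures define a different number.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <linux/sock_diag.h>	/* SK_MEMINFO_* indices */

#ifndef SO_MEMINFO
#define SO_MEMINFO 55	/* asm-generic/socket.h value; arch headers may differ */
#endif

int main(void)
{
	unsigned int mem[SK_MEMINFO_VARS] = { 0 };
	socklen_t len = sizeof(mem);
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return 1;
	if (getsockopt(fd, SOL_SOCKET, SO_MEMINFO, mem, &len) < 0) {
		perror("getsockopt(SO_MEMINFO)");
		return 1;
	}
	/* Same ordering as sk_get_meminfo(): indexed by SK_MEMINFO_*. */
	printf("rmem_alloc=%u rcvbuf=%u wmem_alloc=%u sndbuf=%u fwd_alloc=%u drops=%u\n",
	       mem[SK_MEMINFO_RMEM_ALLOC], mem[SK_MEMINFO_RCVBUF],
	       mem[SK_MEMINFO_WMEM_ALLOC], mem[SK_MEMINFO_SNDBUF],
	       mem[SK_MEMINFO_FWD_ALLOC], mem[SK_MEMINFO_DROPS]);
	return 0;
}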
6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 
7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 
7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 
8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 
9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 
10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 
10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710 10711 10712 10713 10714 10715 10716 10717 10718 10719 10720 10721 10722 10723 10724 10725 10726 10727 10728 10729 10730 10731 10732 10733 10734 10735 10736 10737 10738 10739 10740 10741 10742 10743 10744 10745 10746 10747 10748 10749 10750 10751 10752 10753 10754 10755 10756 10757 10758 10759 10760 10761 10762 10763 10764 10765 10766 10767 10768 10769 10770 10771 10772 10773 10774 10775 10776 10777 10778 10779 10780 10781 10782 10783 10784 10785 10786 10787 10788 10789 10790 10791 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 10802 10803 10804 10805 10806 10807 10808 10809 10810 10811 10812 10813 10814 10815 10816 10817 10818 10819 10820 10821 10822 10823 10824 10825 10826 10827 10828 10829 10830 10831 10832 10833 10834 10835 10836 10837 10838 10839 10840 10841 10842 10843 10844 10845 10846 10847 10848 10849 10850 10851 10852 10853 10854 10855 10856 10857 10858 10859 10860 10861 10862 10863 10864 10865 10866 10867 10868 10869 10870 10871 10872 10873 10874 10875 10876 10877 10878 10879 10880 10881 10882 10883 10884 10885 10886 10887 10888 10889 10890 10891 10892 10893 10894 10895 10896 10897 10898 10899 10900 10901 10902 10903 10904 10905 10906 10907 10908 10909 10910 10911 10912 10913 10914 10915 10916 10917 10918 10919 10920 10921 10922 10923 10924 10925 10926 10927 10928 10929 10930 10931 10932 10933 10934 10935 10936 10937 10938 10939 10940 10941 10942 10943 10944 10945 10946 10947 10948 10949 10950 10951 10952 10953 10954 10955 10956 10957 10958 10959 10960 10961 10962 10963 10964 10965 10966 10967 10968 10969 10970 10971 10972 10973 10974 10975 10976 10977 10978 10979 10980 10981 10982 10983 10984 10985 10986 10987 10988 10989 10990 10991 10992 10993 10994 10995 10996 10997 10998 10999 11000 11001 11002 11003 11004 11005 11006 11007 11008 11009 11010 11011 11012 11013 11014 11015 11016 11017 11018 11019 11020 11021 11022 11023 11024 11025 11026 11027 11028 11029 11030 11031 11032 11033 11034 11035 11036 11037 11038 11039 11040 11041 11042 11043 11044 11045 11046 11047 11048 11049 11050 11051 11052 11053 11054 11055 11056 11057 11058 11059 11060 11061 11062 11063 11064 11065 11066 11067 11068 11069 11070 11071 11072 11073 11074 11075 11076 11077 11078 11079 11080 11081 11082 11083 11084 11085 11086 11087 11088 11089 11090 11091 11092 11093 11094 11095 11096 11097 11098 11099 11100 11101 11102 11103 11104 11105 11106 11107 11108 11109 11110 11111 11112 11113 11114 11115 11116 11117 11118 11119 11120 11121 11122 11123 11124 11125 11126 11127 11128 11129 11130 11131 11132 11133 11134 11135 11136 11137 11138 11139 11140 11141 11142 11143 11144 11145 11146 11147 11148 11149 11150 11151 11152 11153 11154 11155 11156 11157 11158 11159 11160 11161 11162 11163 11164 11165 11166 11167 11168 11169 11170 11171 11172 11173 11174 11175 11176 11177 11178 11179 11180 11181 11182 11183 11184 11185 11186 11187 11188 11189 11190 11191 11192 11193 11194 11195 11196 11197 11198 11199 11200 11201 11202 11203 11204 11205 11206 11207 11208 11209 11210 11211 11212 11213 11214 11215 11216 11217 11218 11219 11220 11221 11222 11223 11224 11225 11226 11227 11228 11229 11230 11231 11232 11233 11234 11235 11236 11237 11238 11239 11240 11241 11242 11243 11244 11245 11246 11247 11248 11249 11250 11251 11252 11253 11254 11255 11256 11257 11258 11259 11260 11261 11262 11263 11264 11265 11266 11267 11268 11269 11270 11271 11272 11273 11274 11275 11276 11277 11278 
11279 11280 11281 11282 11283 11284 11285 11286 11287 11288 11289 11290 11291 11292 11293 11294 11295 11296 11297 11298 11299 11300 11301 11302 11303 11304 11305 11306 11307 11308 11309 11310 11311 11312 11313 11314 11315 11316 11317 11318 11319 11320 11321 11322 11323 11324 11325 11326 11327 11328 11329 11330 11331 11332 11333 11334 11335 11336 11337 11338 11339 11340 11341 11342 11343 11344 11345 11346 11347 11348 11349 11350 11351 11352 11353 11354 11355 11356 11357 11358 11359 11360 11361 11362 11363 11364 11365 11366 11367 11368 11369 11370 11371 11372 11373 11374 11375 11376 11377 11378 11379 11380 11381 11382 11383 11384 11385 11386 11387 11388 11389 11390 11391 11392 11393 11394 11395 11396 11397 11398 11399 11400 11401 11402 11403 11404 11405 11406 11407 11408 11409 11410 11411 11412 11413 11414 11415 11416 11417 11418 11419 11420 11421 11422 11423 11424 11425 11426 11427 11428 11429 11430 11431 11432 11433 11434 11435 11436 11437 11438 11439 11440 11441 11442 11443 11444 11445 11446 11447 11448 11449 11450 11451 11452 11453 11454 11455 11456 11457 11458 11459 11460 11461 11462 11463 11464 11465 11466 11467 11468 11469 11470 11471 11472 11473 11474 11475 11476 11477 11478 11479 11480 11481 11482 11483 11484 11485 11486 11487 11488 11489 11490 11491 11492 11493 11494 11495 11496 11497 11498 11499 11500 11501 11502 11503 11504 11505 11506 11507 11508 11509 11510 11511 11512 11513 11514 11515 11516 11517 11518 11519 11520 11521 11522 11523 11524 11525 11526 11527 11528 11529 11530 11531 11532 11533 11534 11535 11536 11537 11538 11539 11540 11541 11542 11543 11544 11545 11546 11547 11548 11549 11550 11551 11552 11553 11554 11555 11556 11557 11558 11559 11560 11561 11562 11563 11564 11565 11566 11567 11568 11569 11570 11571 11572 11573 11574 11575 11576 11577 11578 11579 11580 11581 11582 11583 11584 11585 11586 11587 11588 11589 11590 11591 11592 11593 11594 11595 11596 11597 11598 11599 11600 11601 11602 11603 11604 11605 11606 11607 11608 11609 11610 11611 11612 11613 11614 11615 11616 11617 11618 11619 11620 11621 11622 11623 11624 11625 11626 11627 11628 11629 11630 11631 11632 11633 11634 11635 11636 11637 11638 11639 11640 11641 11642 11643 11644 11645 11646 11647 11648 11649 11650 11651 11652 11653 11654 11655 11656 11657 11658 11659 11660 11661 11662 11663 11664 11665 11666 11667 11668 11669 11670 11671 11672 11673 11674 11675 11676 11677 11678 11679 11680 11681 11682 11683 11684 11685 11686 11687 11688 11689 11690 11691 11692 11693 11694 11695 11696 11697 11698 11699 11700 11701 11702 11703 11704 11705 11706 11707 11708 11709 11710 11711 11712 11713 11714 11715 11716 11717 11718 11719 11720 11721 11722 11723 11724 11725 11726 11727 11728 11729 11730 11731 11732 11733 11734 11735 11736 11737 11738 11739 11740 11741 11742 11743 11744 11745 11746 11747 11748 11749 11750 11751 11752 11753 11754 11755 11756 11757 11758 11759 11760 11761 11762 11763 11764 11765 11766 11767 11768 11769 11770 11771 11772 11773 11774 11775 11776 11777 11778 11779 11780 11781 11782 11783 11784 11785 11786 11787 11788 11789 11790 11791 11792 11793 11794 11795 11796 11797 11798 11799 11800 11801 11802 11803 11804 11805 11806 11807 11808 11809 11810 11811 11812 11813 11814 11815 11816 11817 11818 11819 11820 11821 11822 11823 11824 11825 11826 11827 11828 11829 11830 11831 11832 11833 11834 11835 11836 11837 11838 11839 11840 11841 11842 11843 11844 11845 11846 11847 11848 11849 11850 11851 11852 11853 11854 11855 11856 11857 11858 11859 11860 11861 11862 11863 11864 11865 11866 11867 11868 11869 11870 
11871 11872 11873 11874 11875 11876 11877 11878 11879 11880 11881 11882 11883 11884 11885 11886 11887 11888 11889 11890 11891 11892 11893 11894 11895 11896 11897 11898 11899 11900 11901 11902 11903 11904 11905 11906 11907 11908 11909 11910 11911 11912 11913 11914 11915 11916 11917 11918 11919 11920 11921 11922 11923 11924 11925 11926 11927 11928 11929 11930 11931 11932 11933 11934 11935 11936 11937 11938 11939 11940 11941 11942 11943 11944 11945 11946 11947 11948 11949 11950 11951 11952 11953 11954 11955 11956 11957 11958 11959 11960 11961 11962 11963 11964 11965 11966 11967 11968 11969 11970 11971 11972 11973 11974 11975 11976 11977 11978 11979 11980 11981 11982 11983 11984 11985 11986 11987 11988 11989 11990 11991 11992 11993 11994 11995 11996 11997 11998 11999 12000 12001 12002 12003 12004 12005 12006 12007 12008 12009 12010 12011 12012 12013 12014 12015 12016 12017 12018 12019 12020 12021 12022 12023 12024 12025 12026 12027 12028 12029 12030 12031 12032 12033 12034 12035 12036 12037 12038 12039 12040 12041 12042 12043 12044 12045 12046 12047 12048 12049 12050 12051 12052 12053 12054 12055 12056 12057 12058 12059 12060 12061 12062 12063 12064 12065 12066 12067 12068 12069 12070 12071 12072 12073 12074 12075 12076 12077 12078 12079 12080 12081 12082 12083 12084 12085 12086 12087 12088 12089 12090 12091 12092 12093 12094 12095 12096 12097 12098 12099 12100 12101 12102 12103 12104 12105 12106 12107 12108 12109 12110 12111 12112 12113 12114 12115 12116 12117 12118 12119 12120 12121 12122 12123 12124 12125 12126 12127 12128 12129 12130 12131 12132 12133 12134 12135 12136 12137 12138 12139 12140 12141 12142 12143 12144 12145 12146 12147 12148 12149 12150 12151 12152 12153 12154 12155 12156 12157 12158 12159 12160 12161 12162 12163 12164 12165 12166 12167 12168 12169 12170 12171 12172 12173 12174 12175 12176 12177 12178 12179 12180 12181 12182 12183 12184 12185 12186 12187 12188 12189 12190 12191 12192 12193 12194 12195 12196 12197 12198 12199 12200 12201 12202 12203 12204 12205 12206 12207 12208 12209 12210 12211 12212 12213 12214 12215 12216 12217 12218 12219 12220 12221 12222 12223 12224 12225 12226 12227 12228 12229 12230 12231 12232 12233 12234 12235 12236 12237 12238 12239 12240 12241 12242 12243 12244 12245 12246 12247 12248 12249 12250 12251 12252 12253 12254 12255 12256 12257 12258 12259 12260 12261 12262 12263 12264 12265 12266 12267 12268 12269 12270 12271 12272 12273 12274 12275 12276 12277 12278 12279 12280 12281 12282 12283 12284 12285 12286 12287 12288 12289 12290 12291 12292 12293 12294 12295 12296 12297 12298 12299 12300 12301 12302 12303 12304 12305 12306 12307 12308 12309 12310 12311 12312 12313 12314 12315 12316 12317 12318 12319 12320 12321 12322 12323 12324 12325 12326 12327 12328 12329 12330 12331 12332 12333 12334 12335 12336 12337 12338 12339 12340 12341 12342 12343 12344 12345 12346 12347 12348 12349 12350 12351 12352 12353 12354 12355 12356 12357 12358 12359 12360 12361 12362 12363 12364 12365 12366 12367 12368 12369 12370 12371 12372 12373 12374 12375 12376 12377 12378 12379 12380 12381 12382 12383 12384 12385 12386 12387 12388 12389 12390 12391 12392 12393 12394 12395 12396 12397 12398 12399 12400 12401 12402 12403 12404 12405 12406 12407 12408 12409 12410 12411 12412 12413 12414 12415 12416 12417 12418 12419 12420 12421 12422 12423 12424 12425 12426 12427 12428 12429 12430 12431 12432 12433 12434 12435 12436 12437 12438 12439 12440 12441 12442 12443 12444 12445 12446 12447 12448 12449 12450 12451 12452 12453 12454 12455 12456 12457 12458 12459 12460 12461 12462 
12463 12464 12465 12466 12467 12468 12469 12470 12471 12472 12473 12474 12475 12476 12477 12478 12479 12480 12481 12482 12483 12484 12485 12486 12487 12488 12489 12490 12491 12492 12493 12494 12495 12496 12497 12498 12499 12500 12501 12502 12503 12504 12505 12506 12507 12508 12509 12510 12511 12512 12513 12514 12515 12516 12517 12518 12519 12520 12521 12522 12523 12524 12525 12526 12527 12528 12529 12530 12531 12532 12533 12534 12535 12536 12537 12538 12539 12540 12541 12542 12543 12544 12545 12546 12547 12548 12549 12550 12551 12552 12553 12554 12555 12556 12557 12558 12559 12560 12561 12562 12563 12564 12565 12566 12567 12568 12569 12570 12571 12572 12573 12574 12575 12576 12577 12578 12579 12580 12581 12582 12583 12584 12585 12586 12587 12588 12589 12590 12591 12592 12593 12594 12595 12596 12597 12598 12599 12600 12601 12602 12603 12604 12605 12606 12607 12608 12609 12610 12611 12612 12613 12614 12615 12616 12617 12618 12619 12620 12621 12622 12623 12624 12625 12626 12627 12628 12629 12630 12631 12632 12633 12634 12635 12636 12637 12638 12639 12640 12641 12642 12643 12644 12645 12646 12647 12648 12649 12650 12651 12652 12653 12654 12655 12656 12657 12658 12659 12660 12661 12662 12663 12664 12665 12666 12667 12668 12669 12670 12671 12672 12673 12674 12675 12676 12677 12678 12679 12680 12681 12682 12683 12684 12685 12686 12687 12688 12689 12690 12691 12692 12693 12694 12695 12696 12697 12698 12699 12700 12701 12702 12703 12704 12705 12706 12707 12708 12709 12710 12711 12712 12713 12714 12715 12716 12717 12718 12719 12720 12721 12722 12723 12724 12725 12726 12727 12728 12729 12730 12731 12732 12733 12734 12735 12736 12737 12738 12739 12740 12741 12742 12743 12744 12745 12746 12747 12748 12749 12750 12751 12752 12753 12754 12755 12756 12757 12758 12759 12760 12761 12762 12763 12764 12765 12766 12767 12768 12769 12770 12771 12772 12773 12774 12775 12776 12777 12778 12779 12780 12781 12782 12783 12784 12785 12786 12787 12788 12789 12790 12791 12792 12793 12794 12795 12796 12797 12798 12799 12800 12801 12802 12803 12804 12805 12806 12807 12808 12809 12810 12811 12812 12813 12814 12815 12816 12817 12818 12819 12820 12821 12822 12823 12824 12825 12826 12827 12828 12829 12830 12831 12832 12833 12834 12835 12836 12837 12838 12839 12840 12841 12842 12843 12844 12845 12846 12847 12848 12849 12850 12851 12852 12853 12854 12855 12856 12857 12858 12859 12860 12861 12862 12863 12864 12865 12866 12867 12868 12869 12870 12871 12872 12873 12874 12875 12876 12877 12878 12879 12880 12881 12882 12883 12884 12885 12886 12887 12888 12889 12890 12891 12892 12893 12894 12895 12896 12897 12898 12899 12900 12901 12902 12903 12904 12905 12906 12907 12908 12909 12910 12911 12912 12913 12914 12915 12916 12917 12918 12919 12920 12921 12922 12923 12924 12925 12926 12927 12928 12929 12930 12931 12932 12933 12934 12935 12936 12937 12938 12939 12940 12941 12942 12943 12944 12945 12946 12947 12948 12949 12950 12951 12952 12953 12954 12955 12956 12957 12958 12959 12960 12961 12962 12963 12964 12965 12966 12967 12968 12969 12970 12971 12972 12973 12974 12975 12976 12977 12978 12979 12980 12981 12982 12983 12984 12985 12986 12987 12988 12989 12990 12991 12992 12993 12994 12995 12996 12997 12998 12999 13000 13001 13002 13003 13004 13005 13006 13007 13008 13009 13010 13011 13012 13013 13014 13015 13016 13017 13018 13019 13020 13021 13022 13023 13024 13025 13026 13027 13028 13029 13030 13031 13032 13033 13034 13035 13036 13037 13038 13039 13040 13041 13042 13043 13044 13045 13046 13047 13048 13049 13050 13051 13052 13053 13054 
13055 13056 13057 13058 13059 13060 13061 13062 13063 13064 13065 13066 13067 13068 13069 13070 13071 13072 13073 13074 13075 13076 13077 13078 13079 13080 13081 13082 13083 13084 13085 13086 13087 13088 13089 13090 13091 13092 13093 13094 13095 13096 13097 13098 13099 13100 13101 13102 13103 13104 13105 13106 13107 13108 13109 13110 13111 13112 13113 13114 13115 13116 13117 13118 13119 13120 13121 13122 13123 13124 13125 13126 13127 13128 13129 13130 13131 13132 13133 13134 13135 13136 13137 13138 13139 13140 13141 13142 13143 13144 13145 13146 13147 13148 13149 13150 13151 13152 13153 13154 13155 13156 13157 13158 13159 13160 13161 13162 13163 13164 13165 13166 13167 13168 13169 13170 13171 13172 13173 13174 13175 13176 13177 13178 13179 13180 13181 13182 13183 13184 13185 13186 13187 13188 13189 13190 13191 13192 13193 13194 13195 13196 13197 13198 13199 13200 13201 13202 13203 13204 13205 13206 13207 13208 13209 13210 13211 13212 13213 13214 13215 13216 13217 13218 13219 13220 13221 13222 13223 13224 13225 13226 13227 13228 13229 13230 13231 13232 13233 13234 13235 13236 13237 13238 13239 13240 13241 13242 13243 13244 13245 13246 13247 13248 13249 13250 13251 13252 13253 13254 13255 13256 13257 13258 13259 13260 13261 13262 13263 13264 13265 13266 13267 13268 13269 13270 13271 13272 13273 13274 13275 13276 13277 13278 13279 13280 13281 13282 13283 13284 13285 13286 13287 13288 13289 13290 13291 13292 13293 13294 13295 13296 13297 13298 13299 13300 13301 13302 13303 13304 13305 13306 13307 13308 13309 13310 13311 13312 13313 13314 13315 13316 13317 13318 13319 13320 13321 13322 13323 13324 13325 13326 13327 13328 13329 13330 13331 13332 13333 13334 13335 13336 13337 13338 13339 13340 13341 13342 13343 13344 13345 13346 13347 13348 13349 13350 13351 13352 13353 13354 13355 13356 13357 13358 13359 13360 13361 13362 13363 13364 13365 13366 13367 13368 13369 13370 13371 13372 13373 13374 13375 13376 13377 13378 13379 13380 13381 13382 13383 13384 13385 13386 13387 13388 13389 13390 13391 13392 13393 13394 13395 13396 13397 13398 13399 13400 13401 13402 13403 13404 13405 13406 13407 13408 13409 13410 13411 13412 13413 13414 13415 13416 13417 13418 13419 13420 13421 13422 13423 13424 13425 13426 13427 13428 13429 13430 13431 13432 13433 13434 13435 13436 13437 13438 13439 13440 13441 13442 13443 13444 13445 13446 13447 13448 13449 13450 13451 13452 13453 13454 13455 13456 13457 13458 13459 13460 13461 13462 13463 13464 13465 13466 13467 13468 13469 13470 13471 13472 13473 13474 13475 13476 13477 13478 13479 13480 13481 13482 13483 13484 13485 13486 13487 13488 13489 13490 13491 13492 13493 13494 13495 13496 13497 13498 13499 13500 13501 13502 13503 13504 13505 13506 13507 13508 13509 13510 13511 13512 13513 13514 13515 13516 13517 13518 13519 13520 13521 13522 13523 13524 13525 13526 13527 13528 13529 13530 13531 13532 13533 13534 13535 13536 13537 13538 13539 13540 13541 13542 13543 13544 13545 13546 13547 13548 13549 13550 13551 13552 13553 13554 13555 13556 13557 13558 13559 13560 13561 13562 13563 13564 13565 13566 13567 13568 13569 13570 13571 13572 13573 13574 13575 13576 13577 13578 13579 13580 13581 13582 13583 13584 13585 13586 13587 13588 13589 13590 13591 13592 13593 13594 13595 13596 13597 13598 13599 13600 13601 13602 13603 13604 13605 13606 13607 13608 13609 13610 13611 13612 13613 13614 13615 13616 13617 13618 13619 13620 13621 13622 13623 13624 13625 13626 13627 13628 13629 13630 13631 13632 13633 13634 13635 13636 13637 13638 13639 13640 13641 13642 13643 13644 13645 13646 
13647 13648 13649 13650 13651 13652 13653 13654 13655 13656 13657 13658 13659 13660 13661 13662 13663 13664 13665 13666 13667 13668 13669 13670 13671 13672 13673 13674 13675 13676 13677 13678 13679 13680 13681 13682 13683 13684 13685 13686 13687 13688 13689 13690 13691 13692 13693 13694 13695 13696 13697 13698 13699 13700 13701 13702 13703 13704 13705 13706 13707 13708 13709 13710 13711 13712 13713 13714 13715 13716 13717 13718 13719 13720 13721 13722 13723 13724 13725 13726 13727 13728 13729 13730 13731 13732 13733 13734 13735 13736 13737 13738 13739 13740 13741 13742 13743 13744 13745 13746 13747 13748 13749 13750 13751 13752 13753 13754 13755 13756 13757 13758 13759 13760 13761 13762 13763 13764 13765 13766 13767 13768 13769 13770 13771 13772 13773 13774 13775 13776 13777 13778 13779 13780 13781 13782 13783 13784 13785 13786 13787 13788 13789 13790 13791 13792 13793 13794 13795 13796 13797 13798 13799 13800 13801 13802 13803 13804 13805 13806 13807 13808 13809 13810 13811 13812 13813 13814 13815 13816 13817 13818 13819 13820 13821 13822 13823 13824 13825 13826 13827 13828 13829 13830 13831 13832 13833 13834 13835 13836 13837 13838 13839 13840 13841 13842 13843 13844 13845 13846 13847 13848 13849 13850 13851 13852 13853 13854 13855 13856 13857 13858 13859 13860 13861 13862 13863 13864 13865 13866 13867 13868 13869 13870 13871 13872 13873 13874 13875 13876 13877 13878 13879 13880 13881 13882 13883 13884 13885 13886 13887 13888 13889 13890 13891 13892 13893 13894 13895 13896 13897 13898 13899 13900 13901 13902 13903 13904 13905 13906 13907 13908 13909 13910 13911 13912 13913 13914 13915 13916 13917 13918 13919 13920 13921 13922 13923 13924 13925 13926 13927 13928 13929 13930 13931 13932 13933 13934 13935 13936 13937 13938 13939 13940 13941 13942 13943 13944 13945 13946 13947 13948 13949 13950 13951 13952 13953 13954 13955 13956 13957 13958 13959 13960 13961 13962 13963 13964 13965 13966 13967 13968 13969 13970 13971 13972 13973 13974 13975 13976 13977 13978 13979 13980 13981 13982 13983 13984 13985 13986 13987 13988 13989 13990 13991 13992 13993 13994 13995 13996 13997 13998 13999 14000 14001 14002 14003 14004 14005 14006 14007 14008 14009 14010 14011 14012 14013 14014 14015 14016 14017 14018 14019 14020 14021 14022 14023 14024 14025 14026 14027 14028 14029 14030 14031 14032 14033 14034 14035 14036 14037 14038 14039 14040 14041 14042 14043 14044 14045 14046 14047 14048 14049 14050 14051 14052 14053 14054 14055 14056 14057 14058 14059 14060 14061 14062 14063 14064 14065 14066 14067 14068 14069 14070 14071 14072 14073 14074 14075 14076 14077 14078 14079 14080 14081 14082 14083 14084 14085 14086 14087 14088 14089 14090 14091 14092 14093 14094 14095 14096 14097 14098 14099 14100 14101 14102 14103 14104 14105 14106 14107 14108 14109 14110 14111 14112 14113 14114 14115 14116 14117 14118 14119 14120 14121 14122 14123 14124 14125 14126 14127 14128 14129 14130 14131 14132 14133 14134 14135 14136 14137 14138 14139 14140 14141 14142 14143 14144 14145 14146 14147 14148 14149 14150 14151 14152 14153 14154 14155 14156 14157 14158 14159 14160 14161 14162 14163 14164 14165 14166 14167 14168 14169 14170 14171 14172 14173 14174 14175 14176 14177 14178 14179 14180 14181 14182 14183 14184 14185 14186 14187 14188 14189 14190 14191 14192 14193 14194 14195 14196 14197 14198 14199 14200 14201 14202 14203 14204 14205 14206 14207 14208 14209 14210 14211 14212 14213 14214 14215 14216 14217 14218 14219 14220 14221 14222 14223 14224 14225 14226 14227 14228 14229 14230 14231 14232 14233 14234 14235 14236 14237 14238 
// SPDX-License-Identifier: GPL-2.0
/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
 *  Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/reboot.h>
#include <linux/vmstat.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/hugetlb.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
#include <linux/trace_events.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/compat.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/namei.h>
#include <linux/parser.h>
#include <linux/sched/clock.h>
#include <linux/sched/mm.h>
#include <linux/proc_ns.h>
#include <linux/mount.h>
#include <linux/min_heap.h>
#include <linux/highmem.h>
#include <linux/pgtable.h>
#include <linux/buildid.h>
#include <linux/task_work.h>

#include "internal.h"

#include <asm/irq_regs.h>

typedef int (*remote_function_f)(void *);

struct remote_function_call {
	struct task_struct	*p;
	remote_function_f	func;
	void			*info;
	int			ret;
};

static void remote_function(void *data)
{
	struct remote_function_call *tfc = data;
	struct task_struct *p = tfc->p;

	if (p) {
		/* -EAGAIN */
		if (task_cpu(p) != smp_processor_id())
			return;

		/*
		 * Now that we're on right CPU with IRQs disabled, we can test
		 * if we hit the right task without races.
		 */
		tfc->ret = -ESRCH; /* No such (running) process */
		if (p != current)
			return;
	}

	tfc->ret = tfc->func(tfc->info);
}

/**
 * task_function_call - call a function on the cpu on which a task runs
 * @p:		the task to evaluate
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func when the task is currently running. This might
 * be on the current CPU, which just calls the function directly. This will
 * retry due to any failures in smp_call_function_single(), such as if the
 * task_cpu() goes offline concurrently.
 *
 * returns @func return value or -ESRCH or -ENXIO when the process isn't running
 */
static int task_function_call(struct task_struct *p, remote_function_f func, void *info)
{
	struct remote_function_call data = {
		.p	= p,
		.func	= func,
		.info	= info,
		.ret	= -EAGAIN,
	};
	int ret;

	for (;;) {
		ret = smp_call_function_single(task_cpu(p), remote_function,
					       &data, 1);
		if (!ret)
			ret = data.ret;

		if (ret != -EAGAIN)
			break;

		cond_resched();
	}

	return ret;
}
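/*
 * [Editor's illustrative sketch, not part of core.c] A minimal example of how
 * a caller might use task_function_call(): the callback runs synchronously,
 * with IRQs disabled, on the CPU where @p is currently running (and only if
 * @p is still the running task there). The helper names below are
 * hypothetical.
 */
#if 0	/* illustration only */
static int example_read_task_flag(void *info)
{
	int *out = info;

	/* runs with IRQs off, current == the task passed to task_function_call() */
	*out = !!(current->flags & PF_KTHREAD);
	return 0;
}

static int example_query(struct task_struct *p)
{
	int flag = 0;
	int err = task_function_call(p, example_read_task_flag, &flag);

	return err ? err : flag;
}
#endif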
/**
 * cpu_function_call - call a function on the cpu
 * @cpu:	target cpu to queue this function
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func on the remote cpu.
 *
 * returns: @func return value or -ENXIO when the cpu is offline
 */
static int cpu_function_call(int cpu, remote_function_f func, void *info)
{
	struct remote_function_call data = {
		.p	= NULL,
		.func	= func,
		.info	= info,
		.ret	= -ENXIO, /* No such CPU */
	};

	smp_call_function_single(cpu, remote_function, &data, 1);

	return data.ret;
}

enum event_type_t {
	EVENT_FLEXIBLE	= 0x01,
	EVENT_PINNED	= 0x02,
	EVENT_TIME	= 0x04,
	EVENT_FROZEN	= 0x08, /* see ctx_resched() for details */
	EVENT_CPU	= 0x10,
	EVENT_CGROUP	= 0x20,

	/* compound helpers */
	EVENT_ALL	= EVENT_FLEXIBLE | EVENT_PINNED,
	EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN,
};

static inline void __perf_ctx_lock(struct perf_event_context *ctx)
{
	raw_spin_lock(&ctx->lock);
	WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN);
}

static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
			  struct perf_event_context *ctx)
{
	__perf_ctx_lock(&cpuctx->ctx);
	if (ctx)
		__perf_ctx_lock(ctx);
}

static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
{
	/*
	 * If ctx_sched_in() didn't again set any ALL flags, clean up
	 * after ctx_sched_out() by clearing is_active.
	 */
	if (ctx->is_active & EVENT_FROZEN) {
		if (!(ctx->is_active & EVENT_ALL))
			ctx->is_active = 0;
		else
			ctx->is_active &= ~EVENT_FROZEN;
	}
	raw_spin_unlock(&ctx->lock);
}

static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
			    struct perf_event_context *ctx)
{
	if (ctx)
		__perf_ctx_unlock(ctx);
	__perf_ctx_unlock(&cpuctx->ctx);
}
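/*
 * [Editor's illustrative sketch, not part of core.c] The helpers above
 * enforce a fixed nesting order: the per-CPU context lock is taken first and
 * the (optional) task context lock is nested inside it, released in reverse.
 * The wrapper name below is hypothetical.
 */
#if 0	/* illustration only */
static void example_with_both_ctxs_locked(struct perf_cpu_context *cpuctx,
					  struct perf_event_context *task_ctx)
{
	perf_ctx_lock(cpuctx, task_ctx);
	/* ... both contexts (task_ctx may be NULL) are now stable ... */
	perf_ctx_unlock(cpuctx, task_ctx);
}
#endif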
#define TASK_TOMBSTONE ((void *)-1L)

static bool is_kernel_event(struct perf_event *event)
{
	return READ_ONCE(event->owner) == TASK_TOMBSTONE;
}

static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);

struct perf_event_context *perf_cpu_task_ctx(void)
{
	lockdep_assert_irqs_disabled();

	return this_cpu_ptr(&perf_cpu_context)->task_ctx;
}

/*
 * On task ctx scheduling...
 *
 * When !ctx->nr_events a task context will not be scheduled. This means
 * we can disable the scheduler hooks (for performance) without leaving
 * pending task ctx state.
 *
 * This however results in two special cases:
 *
 *  - removing the last event from a task ctx; this is relatively straight
 *    forward and is done in __perf_remove_from_context.
 *
 *  - adding the first event to a task ctx; this is tricky because we cannot
 *    rely on ctx->is_active and therefore cannot use event_function_call().
 *    See perf_install_in_context().
 *
 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
 */

typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
			struct perf_event_context *, void *);

struct event_function_struct {
	struct perf_event *event;
	event_f func;
	void *data;
};

static int event_function(void *info)
{
	struct event_function_struct *efs = info;
	struct perf_event *event = efs->event;
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
	struct perf_event_context *task_ctx = cpuctx->task_ctx;
	int ret = 0;

	lockdep_assert_irqs_disabled();

	perf_ctx_lock(cpuctx, task_ctx);
	/*
	 * Since we do the IPI call without holding ctx->lock things can have
	 * changed, double check we hit the task we set out to hit.
	 */
	if (ctx->task) {
		if (ctx->task != current) {
			ret = -ESRCH;
			goto unlock;
		}

		/*
		 * We only use event_function_call() on established contexts,
		 * and event_function() is only ever called when active (or
		 * rather, we'll have bailed in task_function_call() or the
		 * above ctx->task != current test), therefore we must have
		 * ctx->is_active here.
		 */
		WARN_ON_ONCE(!ctx->is_active);
		/*
		 * And since we have ctx->is_active, cpuctx->task_ctx must
		 * match.
		 */
		WARN_ON_ONCE(task_ctx != ctx);
	} else {
		WARN_ON_ONCE(&cpuctx->ctx != ctx);
	}

	efs->func(event, cpuctx, ctx, efs->data);
unlock:
	perf_ctx_unlock(cpuctx, task_ctx);

	return ret;
}

static void event_function_call(struct perf_event *event, event_f func, void *data)
{
	struct perf_event_context *ctx = event->ctx;
	struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
	struct perf_cpu_context *cpuctx;
	struct event_function_struct efs = {
		.event = event,
		.func = func,
		.data = data,
	};

	if (!event->parent) {
		/*
		 * If this is a !child event, we must hold ctx::mutex to
		 * stabilize the event->ctx relation. See
		 * perf_event_ctx_lock().
		 */
		lockdep_assert_held(&ctx->mutex);
	}

	if (!task) {
		cpu_function_call(event->cpu, event_function, &efs);
		return;
	}

	if (task == TASK_TOMBSTONE)
		return;

again:
	if (!task_function_call(task, event_function, &efs))
		return;

	local_irq_disable();
	cpuctx = this_cpu_ptr(&perf_cpu_context);
	perf_ctx_lock(cpuctx, ctx);
	/*
	 * Reload the task pointer, it might have been changed by
	 * a concurrent perf_event_context_sched_out().
	 */
	task = ctx->task;
	if (task == TASK_TOMBSTONE)
		goto unlock;
	if (ctx->is_active) {
		perf_ctx_unlock(cpuctx, ctx);
		local_irq_enable();
		goto again;
	}
	func(event, NULL, ctx, data);
unlock:
	perf_ctx_unlock(cpuctx, ctx);
	local_irq_enable();
}
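/*
 * [Editor's illustrative sketch, not part of core.c] Callers typically wrap a
 * small per-event operation in an event_f callback and let
 * event_function_call() route it to the CPU/task that owns the context. The
 * callback and wrapper names below are hypothetical.
 */
#if 0	/* illustration only */
static void __example_touch_event(struct perf_event *event,
				  struct perf_cpu_context *cpuctx,
				  struct perf_event_context *ctx,
				  void *data)
{
	/* runs with ctx->lock held, on the CPU that owns @ctx */
}

static void example_touch_event(struct perf_event *event)
{
	event_function_call(event, __example_touch_event, NULL);
}
#endif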
/*
 * Similar to event_function_call() + event_function(), but hard assumes IRQs
 * are already disabled and we're on the right CPU.
 */
static void event_function_local(struct perf_event *event, event_f func, void *data)
{
	struct perf_event_context *ctx = event->ctx;
	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
	struct task_struct *task = READ_ONCE(ctx->task);
	struct perf_event_context *task_ctx = NULL;

	lockdep_assert_irqs_disabled();

	if (task) {
		if (task == TASK_TOMBSTONE)
			return;

		task_ctx = ctx;
	}

	perf_ctx_lock(cpuctx, task_ctx);

	task = ctx->task;
	if (task == TASK_TOMBSTONE)
		goto unlock;

	if (task) {
		/*
		 * We must be either inactive or active and the right task,
		 * otherwise we're screwed, since we cannot IPI to somewhere
		 * else.
		 */
		if (ctx->is_active) {
			if (WARN_ON_ONCE(task != current))
				goto unlock;

			if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
				goto unlock;
		}
	} else {
		WARN_ON_ONCE(&cpuctx->ctx != ctx);
	}

	func(event, cpuctx, ctx, data);
unlock:
	perf_ctx_unlock(cpuctx, task_ctx);
}

#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
		       PERF_FLAG_FD_OUTPUT  |\
		       PERF_FLAG_PID_CGROUP |\
		       PERF_FLAG_FD_CLOEXEC)

/*
 * branch priv levels that need permission checks
 */
#define PERF_SAMPLE_BRANCH_PERM_PLM \
	(PERF_SAMPLE_BRANCH_KERNEL |\
	 PERF_SAMPLE_BRANCH_HV)

/*
 * perf_sched_events : >0 events exist
 */

static void perf_sched_delayed(struct work_struct *work);
DEFINE_STATIC_KEY_FALSE(perf_sched_events);
static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;

static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);

static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_namespaces_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;
static atomic_t nr_ksymbol_events __read_mostly;
static atomic_t nr_bpf_events __read_mostly;
static atomic_t nr_cgroup_events __read_mostly;
static atomic_t nr_text_poke_events __read_mostly;
static atomic_t nr_build_id_events __read_mostly;

static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
static cpumask_var_t perf_online_mask;
static cpumask_var_t perf_online_core_mask;
static cpumask_var_t perf_online_die_mask;
static cpumask_var_t perf_online_cluster_mask;
static cpumask_var_t perf_online_pkg_mask;
static cpumask_var_t perf_online_sys_mask;
static struct kmem_cache *perf_event_cache;

/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 2;

/* Minimum for 512 kiB + 1 user control page */
int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */

/*
 * max perf event sample rate
 */
#define DEFAULT_MAX_SAMPLE_RATE		100000
#define DEFAULT_SAMPLE_PERIOD_NS	(NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
#define DEFAULT_CPU_TIME_MAX_PERCENT	25

int sysctl_perf_event_sample_rate __read_mostly	= DEFAULT_MAX_SAMPLE_RATE;

static int max_samples_per_tick __read_mostly	= DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
static int perf_sample_period_ns __read_mostly	= DEFAULT_SAMPLE_PERIOD_NS;

static int perf_sample_allowed_ns __read_mostly =
	DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;

static void update_perf_cpu_limits(void)
{
	u64 tmp = perf_sample_period_ns;

	tmp *= sysctl_perf_cpu_time_max_percent;
	tmp = div_u64(tmp, 100);
	if (!tmp)
		tmp = 1;

	WRITE_ONCE(perf_sample_allowed_ns, tmp);
}
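/*
 * [Editor's note] Worked example of the computation above, assuming the
 * defaults shown here: perf_sample_period_ns = NSEC_PER_SEC / 100000 =
 * 10000 ns and sysctl_perf_cpu_time_max_percent = 25, giving
 * perf_sample_allowed_ns = 10000 * 25 / 100 = 2500 ns of handler time
 * allowed per sample.
 */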
return 0; } int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; if (sysctl_perf_cpu_time_max_percent == 100 || sysctl_perf_cpu_time_max_percent == 0) { printk(KERN_WARNING "perf: Dynamic interrupt throttling disabled, can hang your system!\n"); WRITE_ONCE(perf_sample_allowed_ns, 0); } else { update_perf_cpu_limits(); } return 0; } /* * perf samples are done in some very critical code paths (NMIs). * If they take too much CPU time, the system can lock up and not * get any real work done. This will drop the sample rate when * we detect that events are taking too long. */ #define NR_ACCUMULATED_SAMPLES 128 static DEFINE_PER_CPU(u64, running_sample_length); static u64 __report_avg; static u64 __report_allowed; static void perf_duration_warn(struct irq_work *w) { printk_ratelimited(KERN_INFO "perf: interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", __report_avg, __report_allowed, sysctl_perf_event_sample_rate); } static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn); void perf_sample_event_took(u64 sample_len_ns) { u64 max_len = READ_ONCE(perf_sample_allowed_ns); u64 running_len; u64 avg_len; u32 max; if (max_len == 0) return; /* Decay the counter by 1 average sample. */ running_len = __this_cpu_read(running_sample_length); running_len -= running_len/NR_ACCUMULATED_SAMPLES; running_len += sample_len_ns; __this_cpu_write(running_sample_length, running_len); /* * Note: this will be biased artificially low until we have * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us * from having to maintain a count. */ avg_len = running_len/NR_ACCUMULATED_SAMPLES; if (avg_len <= max_len) return; __report_avg = avg_len; __report_allowed = max_len; /* * Compute a throttle threshold 25% below the current duration. */ avg_len += avg_len / 4; max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent; if (avg_len < max) max /= (u32)avg_len; else max = 1; WRITE_ONCE(perf_sample_allowed_ns, avg_len); WRITE_ONCE(max_samples_per_tick, max); sysctl_perf_event_sample_rate = max * HZ; perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; if (!irq_work_queue(&perf_duration_work)) { early_printk("perf: interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", __report_avg, __report_allowed, sysctl_perf_event_sample_rate); } } static atomic64_t perf_event_id; static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); void __weak perf_event_print_debug(void) { } static inline u64 perf_clock(void) { return local_clock(); } static inline u64 perf_event_clock(struct perf_event *event) { return event->clock(); } /* * State based event timekeeping... * * The basic idea is to use event->state to determine which (if any) time * fields to increment with the current delta. This means we only need to * update timestamps when we change state or when they are explicitly requested * (read). * * Event groups make things a little more complicated, but not terribly so. The * rules for a group are that if the group leader is OFF the entire group is * OFF, irrespective of what the group member states are. This results in * __perf_effective_state(). 
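/*
 * Standalone sketch (illustrative values only, plain userspace C) of the
 * accumulator used by perf_sample_event_took() above: each sample the running
 * length loses 1/NR_ACCUMULATED_SAMPLES of itself and gains the new sample
 * duration, so running_len / NR_ACCUMULATED_SAMPLES converges on the average
 * sample length without maintaining an explicit sample count.
 */
#include <stdio.h>
#include <stdint.h>

#define NR_ACCUMULATED_SAMPLES 128

int main(void)
{
        uint64_t running_len = 0;
        uint64_t sample_len_ns = 20000;         /* pretend every NMI takes 20us */
        uint64_t max_len = 6250;                /* pretend perf_sample_allowed_ns */

        for (int i = 0; i < 1024; i++) {
                running_len -= running_len / NR_ACCUMULATED_SAMPLES;
                running_len += sample_len_ns;
        }

        uint64_t avg_len = running_len / NR_ACCUMULATED_SAMPLES;

        printf("avg sample length: %llu ns (allowed %llu ns)\n",
               (unsigned long long)avg_len, (unsigned long long)max_len);
        if (avg_len > max_len)
                printf("kernel would lower kernel.perf_event_max_sample_rate\n");
        return 0;
}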
* * A further ramification is that when a group leader flips between OFF and * !OFF, we need to update all group member times. * * * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we * need to make sure the relevant context time is updated before we try and * update our timestamps. */ static __always_inline enum perf_event_state __perf_effective_state(struct perf_event *event) { struct perf_event *leader = event->group_leader; if (leader->state <= PERF_EVENT_STATE_OFF) return leader->state; return event->state; } static __always_inline void __perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running) { enum perf_event_state state = __perf_effective_state(event); u64 delta = now - event->tstamp; *enabled = event->total_time_enabled; if (state >= PERF_EVENT_STATE_INACTIVE) *enabled += delta; *running = event->total_time_running; if (state >= PERF_EVENT_STATE_ACTIVE) *running += delta; } static void perf_event_update_time(struct perf_event *event) { u64 now = perf_event_time(event); __perf_update_times(event, now, &event->total_time_enabled, &event->total_time_running); event->tstamp = now; } static void perf_event_update_sibling_time(struct perf_event *leader) { struct perf_event *sibling; for_each_sibling_event(sibling, leader) perf_event_update_time(sibling); } static void perf_event_set_state(struct perf_event *event, enum perf_event_state state) { if (event->state == state) return; perf_event_update_time(event); /* * If a group leader gets enabled/disabled all its siblings * are affected too. */ if ((event->state < 0) ^ (state < 0)) perf_event_update_sibling_time(event); WRITE_ONCE(event->state, state); } /* * UP store-release, load-acquire */ #define __store_release(ptr, val) \ do { \ barrier(); \ WRITE_ONCE(*(ptr), (val)); \ } while (0) #define __load_acquire(ptr) \ ({ \ __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \ barrier(); \ ___p; \ }) #define for_each_epc(_epc, _ctx, _pmu, _cgroup) \ list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \ if (_cgroup && !_epc->nr_cgroups) \ continue; \ else if (_pmu && _epc->pmu != _pmu) \ continue; \ else static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup) { struct perf_event_pmu_context *pmu_ctx; for_each_epc(pmu_ctx, ctx, NULL, cgroup) perf_pmu_disable(pmu_ctx->pmu); } static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup) { struct perf_event_pmu_context *pmu_ctx; for_each_epc(pmu_ctx, ctx, NULL, cgroup) perf_pmu_enable(pmu_ctx->pmu); } static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); #ifdef CONFIG_CGROUP_PERF static inline bool perf_cgroup_match(struct perf_event *event) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); /* @event doesn't care about cgroup */ if (!event->cgrp) return true; /* wants specific cgroup scope but @cpuctx isn't associated with any */ if (!cpuctx->cgrp) return false; /* * Cgroup scoping is recursive. An event enabled for a cgroup is * also enabled for all its descendant cgroups. If @cpuctx's * cgroup is a descendant of @event's (the test covers identity * case), it's a match. 
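/*
 * Userspace illustration (a sketch, not kernel code) of how the
 * total_time_enabled / total_time_running values maintained by
 * __perf_update_times() are usually consumed: scaling a count that was only
 * running part of the time. The event choice (task-clock) and the busy loop
 * are arbitrary placeholders.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                            int cpu, int group_fd, unsigned long flags)
{
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
        struct perf_event_attr attr;
        struct {
                unsigned long long value, time_enabled, time_running;
        } res;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_SOFTWARE;
        attr.config = PERF_COUNT_SW_TASK_CLOCK;
        attr.disabled = 1;
        attr.exclude_kernel = 1;
        attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
                           PERF_FORMAT_TOTAL_TIME_RUNNING;

        fd = perf_event_open(&attr, 0, -1, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
        for (volatile long i = 0; i < 10000000; i++)
                ;
        ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

        if (read(fd, &res, sizeof(res)) == sizeof(res) && res.time_running) {
                double scaled = (double)res.value *
                                res.time_enabled / res.time_running;

                printf("raw %llu, enabled %llu ns, running %llu ns, scaled %.0f\n",
                       res.value, res.time_enabled, res.time_running, scaled);
        }
        close(fd);
        return 0;
}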
*/ return cgroup_is_descendant(cpuctx->cgrp->css.cgroup, event->cgrp->css.cgroup); } static inline void perf_detach_cgroup(struct perf_event *event) { css_put(&event->cgrp->css); event->cgrp = NULL; } static inline int is_cgroup_event(struct perf_event *event) { return event->cgrp != NULL; } static inline u64 perf_cgroup_event_time(struct perf_event *event) { struct perf_cgroup_info *t; t = per_cpu_ptr(event->cgrp->info, event->cpu); return t->time; } static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { struct perf_cgroup_info *t; t = per_cpu_ptr(event->cgrp->info, event->cpu); if (!__load_acquire(&t->active)) return t->time; now += READ_ONCE(t->timeoffset); return now; } static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv) { if (adv) info->time += now - info->timestamp; info->timestamp = now; /* * see update_context_time() */ WRITE_ONCE(info->timeoffset, info->time - info->timestamp); } static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) { struct perf_cgroup *cgrp = cpuctx->cgrp; struct cgroup_subsys_state *css; struct perf_cgroup_info *info; if (cgrp) { u64 now = perf_clock(); for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); __update_cgrp_time(info, now, true); if (final) __store_release(&info->active, 0); } } } static inline void update_cgrp_time_from_event(struct perf_event *event) { struct perf_cgroup_info *info; /* * ensure we access cgroup data only when needed and * when we know the cgroup is pinned (css_get) */ if (!is_cgroup_event(event)) return; info = this_cpu_ptr(event->cgrp->info); /* * Do not update time when cgroup is not active */ if (info->active) __update_cgrp_time(info, perf_clock(), true); } static inline void perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) { struct perf_event_context *ctx = &cpuctx->ctx; struct perf_cgroup *cgrp = cpuctx->cgrp; struct perf_cgroup_info *info; struct cgroup_subsys_state *css; /* * ctx->lock held by caller * ensure we do not access cgroup data * unless we have the cgroup pinned (css_get) */ if (!cgrp) return; WARN_ON_ONCE(!ctx->nr_cgroups); for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); __update_cgrp_time(info, ctx->timestamp, false); __store_release(&info->active, 1); } } /* * reschedule events based on the cgroup constraint of task. */ static void perf_cgroup_switch(struct task_struct *task) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_cgroup *cgrp; /* * cpuctx->cgrp is set when the first cgroup event enabled, * and is cleared when the last cgroup event disabled. 
*/ if (READ_ONCE(cpuctx->cgrp) == NULL) return; WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); cgrp = perf_cgroup_from_task(task, NULL); if (READ_ONCE(cpuctx->cgrp) == cgrp) return; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_ctx_disable(&cpuctx->ctx, true); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); /* * must not be done before ctxswout due * to update_cgrp_time_from_cpuctx() in * ctx_sched_out() */ cpuctx->cgrp = cgrp; /* * set cgrp before ctxsw in to allow * perf_cgroup_set_timestamp() in ctx_sched_in() * to not have to pass task around */ ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); perf_ctx_enable(&cpuctx->ctx, true); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } static int perf_cgroup_ensure_storage(struct perf_event *event, struct cgroup_subsys_state *css) { struct perf_cpu_context *cpuctx; struct perf_event **storage; int cpu, heap_size, ret = 0; /* * Allow storage to have sufficient space for an iterator for each * possibly nested cgroup plus an iterator for events with no cgroup. */ for (heap_size = 1; css; css = css->parent) heap_size++; for_each_possible_cpu(cpu) { cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); if (heap_size <= cpuctx->heap_size) continue; storage = kmalloc_node(heap_size * sizeof(struct perf_event *), GFP_KERNEL, cpu_to_node(cpu)); if (!storage) { ret = -ENOMEM; break; } raw_spin_lock_irq(&cpuctx->ctx.lock); if (cpuctx->heap_size < heap_size) { swap(cpuctx->heap, storage); if (storage == cpuctx->heap_default) storage = NULL; cpuctx->heap_size = heap_size; } raw_spin_unlock_irq(&cpuctx->ctx.lock); kfree(storage); } return ret; } static inline int perf_cgroup_connect(int fd, struct perf_event *event, struct perf_event_attr *attr, struct perf_event *group_leader) { struct perf_cgroup *cgrp; struct cgroup_subsys_state *css; CLASS(fd, f)(fd); int ret = 0; if (fd_empty(f)) return -EBADF; css = css_tryget_online_from_dir(fd_file(f)->f_path.dentry, &perf_event_cgrp_subsys); if (IS_ERR(css)) return PTR_ERR(css); ret = perf_cgroup_ensure_storage(event, css); if (ret) return ret; cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; /* * all events in a group must monitor * the same cgroup because a task belongs * to only one perf cgroup at a time */ if (group_leader && group_leader->cgrp != cgrp) { perf_detach_cgroup(event); ret = -EINVAL; } return ret; } static inline void perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; if (!is_cgroup_event(event)) return; event->pmu_ctx->nr_cgroups++; /* * Because cgroup events are always per-cpu events, * @ctx == &cpuctx->ctx. */ cpuctx = container_of(ctx, struct perf_cpu_context, ctx); if (ctx->nr_cgroups++) return; cpuctx->cgrp = perf_cgroup_from_task(current, ctx); } static inline void perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; if (!is_cgroup_event(event)) return; event->pmu_ctx->nr_cgroups--; /* * Because cgroup events are always per-cpu events, * @ctx == &cpuctx->ctx. 
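/*
 * Userspace sketch of what perf_cgroup_connect() above is handed: with
 * PERF_FLAG_PID_CGROUP the "pid" argument is an open fd of a cgroup
 * directory, and the event must be a per-CPU event (cpu >= 0). The cgroup
 * path below is an assumption for illustration, and opening a CPU-wide event
 * needs CAP_PERFMON or a low perf_event_paranoid setting.
 */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
        struct perf_event_attr attr;
        int cgrp_fd, ev_fd;

        cgrp_fd = open("/sys/fs/cgroup/perf_event/mygroup",
                       O_RDONLY | O_DIRECTORY);
        if (cgrp_fd < 0) {
                perror("open cgroup dir");
                return 1;
        }

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_HARDWARE;
        attr.config = PERF_COUNT_HW_CPU_CYCLES;

        /* pid == cgroup dir fd, cpu 0, scoped by PERF_FLAG_PID_CGROUP */
        ev_fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, 0, -1,
                        PERF_FLAG_PID_CGROUP);
        if (ev_fd < 0)
                perror("perf_event_open(PERF_FLAG_PID_CGROUP)");
        else
                printf("counting cycles for tasks of that cgroup on CPU0\n");
        close(cgrp_fd);
        return 0;
}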
*/ cpuctx = container_of(ctx, struct perf_cpu_context, ctx); if (--ctx->nr_cgroups) return; cpuctx->cgrp = NULL; } #else /* !CONFIG_CGROUP_PERF */ static inline bool perf_cgroup_match(struct perf_event *event) { return true; } static inline void perf_detach_cgroup(struct perf_event *event) {} static inline int is_cgroup_event(struct perf_event *event) { return 0; } static inline void update_cgrp_time_from_event(struct perf_event *event) { } static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) { } static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, struct perf_event_attr *attr, struct perf_event *group_leader) { return -EINVAL; } static inline void perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) { } static inline u64 perf_cgroup_event_time(struct perf_event *event) { return 0; } static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { return 0; } static inline void perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { } static inline void perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) { } static void perf_cgroup_switch(struct task_struct *task) { } #endif /* * set default to be dependent on timer tick just * like original code */ #define PERF_CPU_HRTIMER (1000 / HZ) /* * function must be called with interrupts disabled */ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) { struct perf_cpu_pmu_context *cpc; bool rotations; lockdep_assert_irqs_disabled(); cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer); rotations = perf_rotate_context(cpc); raw_spin_lock(&cpc->hrtimer_lock); if (rotations) hrtimer_forward_now(hr, cpc->hrtimer_interval); else cpc->hrtimer_active = 0; raw_spin_unlock(&cpc->hrtimer_lock); return rotations ? 
HRTIMER_RESTART : HRTIMER_NORESTART; } static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu) { struct hrtimer *timer = &cpc->hrtimer; struct pmu *pmu = cpc->epc.pmu; u64 interval; /* * check default is sane, if not set then force to * default interval (1/tick) */ interval = pmu->hrtimer_interval_ms; if (interval < 1) interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); raw_spin_lock_init(&cpc->hrtimer_lock); hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); timer->function = perf_mux_hrtimer_handler; } static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc) { struct hrtimer *timer = &cpc->hrtimer; unsigned long flags; raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags); if (!cpc->hrtimer_active) { cpc->hrtimer_active = 1; hrtimer_forward_now(timer, cpc->hrtimer_interval); hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); } raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags); return 0; } static int perf_mux_hrtimer_restart_ipi(void *arg) { return perf_mux_hrtimer_restart(arg); } void perf_pmu_disable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); if (!(*count)++) pmu->pmu_disable(pmu); } void perf_pmu_enable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); if (!--(*count)) pmu->pmu_enable(pmu); } static void perf_assert_pmu_disabled(struct pmu *pmu) { WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0); } static void get_ctx(struct perf_event_context *ctx) { refcount_inc(&ctx->refcount); } static void *alloc_task_ctx_data(struct pmu *pmu) { if (pmu->task_ctx_cache) return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL); return NULL; } static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) { if (pmu->task_ctx_cache && task_ctx_data) kmem_cache_free(pmu->task_ctx_cache, task_ctx_data); } static void free_ctx(struct rcu_head *head) { struct perf_event_context *ctx; ctx = container_of(head, struct perf_event_context, rcu_head); kfree(ctx); } static void put_ctx(struct perf_event_context *ctx) { if (refcount_dec_and_test(&ctx->refcount)) { if (ctx->parent_ctx) put_ctx(ctx->parent_ctx); if (ctx->task && ctx->task != TASK_TOMBSTONE) put_task_struct(ctx->task); call_rcu(&ctx->rcu_head, free_ctx); } } /* * Because of perf_event::ctx migration in sys_perf_event_open::move_group and * perf_pmu_migrate_context() we need some magic. * * Those places that change perf_event::ctx will hold both * perf_event_ctx::mutex of the 'old' and 'new' ctx value. * * Lock ordering is by mutex address. There are two other sites where * perf_event_context::mutex nests and those are: * * - perf_event_exit_task_context() [ child , 0 ] * perf_event_exit_event() * put_event() [ parent, 1 ] * * - perf_event_init_context() [ parent, 0 ] * inherit_task_group() * inherit_group() * inherit_event() * perf_event_alloc() * perf_init_event() * perf_try_init_event() [ child , 1 ] * * While it appears there is an obvious deadlock here -- the parent and child * nesting levels are inverted between the two. This is in fact safe because * life-time rules separate them. That is an exiting task cannot fork, and a * spawning task cannot (yet) exit. * * But remember that these are parent<->child context relations, and * migration does not affect children, therefore these two orderings should not * interact. 
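/*
 * The rotation hrtimer initialised above defaults to 1000/HZ ms but takes its
 * period from pmu->hrtimer_interval_ms, which each PMU exposes through sysfs
 * as perf_event_mux_interval_ms. Minimal userspace sketch; the "cpu" PMU name
 * is an assumption and writing the file requires sufficient privilege.
 */
#include <stdio.h>

int main(void)
{
        const char *path =
                "/sys/bus/event_source/devices/cpu/perf_event_mux_interval_ms";
        FILE *f = fopen(path, "r+");
        int ms;

        if (!f) {
                perror("fopen");
                return 1;
        }
        if (fscanf(f, "%d", &ms) == 1) {
                printf("current multiplexing interval: %d ms\n", ms);
                /* double the rotation period for this PMU (illustrative only) */
                rewind(f);
                fprintf(f, "%d\n", ms * 2);
        }
        fclose(f);
        return 0;
}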
* * The change in perf_event::ctx does not affect children (as claimed above) * because the sys_perf_event_open() case will install a new event and break * the ctx parent<->child relation, and perf_pmu_migrate_context() is only * concerned with cpuctx and that doesn't have children. * * The places that change perf_event::ctx will issue: * * perf_remove_from_context(); * synchronize_rcu(); * perf_install_in_context(); * * to affect the change. The remove_from_context() + synchronize_rcu() should * quiesce the event, after which we can install it in the new location. This * means that only external vectors (perf_fops, prctl) can perturb the event * while in transit. Therefore all such accessors should also acquire * perf_event_context::mutex to serialize against this. * * However; because event->ctx can change while we're waiting to acquire * ctx->mutex we must be careful and use the below perf_event_ctx_lock() * function. * * Lock order: * exec_update_lock * task_struct::perf_event_mutex * perf_event_context::mutex * perf_event::child_mutex; * perf_event_context::lock * mmap_lock * perf_event::mmap_mutex * perf_buffer::aux_mutex * perf_addr_filters_head::lock * * cpu_hotplug_lock * pmus_lock * cpuctx->mutex / perf_event_context::mutex */ static struct perf_event_context * perf_event_ctx_lock_nested(struct perf_event *event, int nesting) { struct perf_event_context *ctx; again: rcu_read_lock(); ctx = READ_ONCE(event->ctx); if (!refcount_inc_not_zero(&ctx->refcount)) { rcu_read_unlock(); goto again; } rcu_read_unlock(); mutex_lock_nested(&ctx->mutex, nesting); if (event->ctx != ctx) { mutex_unlock(&ctx->mutex); put_ctx(ctx); goto again; } return ctx; } static inline struct perf_event_context * perf_event_ctx_lock(struct perf_event *event) { return perf_event_ctx_lock_nested(event, 0); } static void perf_event_ctx_unlock(struct perf_event *event, struct perf_event_context *ctx) { mutex_unlock(&ctx->mutex); put_ctx(ctx); } /* * This must be done under the ctx->lock, such as to serialize against * context_equiv(), therefore we cannot call put_ctx() since that might end up * calling scheduler related locks and ctx->lock nests inside those. */ static __must_check struct perf_event_context * unclone_ctx(struct perf_event_context *ctx) { struct perf_event_context *parent_ctx = ctx->parent_ctx; lockdep_assert_held(&ctx->lock); if (parent_ctx) ctx->parent_ctx = NULL; ctx->generation++; return parent_ctx; } static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p, enum pid_type type) { u32 nr; /* * only top level events have the pid namespace they were created in */ if (event->parent) event = event->parent; nr = __task_pid_nr_ns(p, type, event->ns); /* avoid -1 if it is idle thread or runs in another ns */ if (!nr && !pid_alive(p)) nr = -1; return nr; } static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) { return perf_event_pid_type(event, p, PIDTYPE_TGID); } static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) { return perf_event_pid_type(event, p, PIDTYPE_PID); } /* * If we inherit events we want to return the parent event id * to userspace. */ static u64 primary_event_id(struct perf_event *event) { u64 id = event->id; if (event->parent) id = event->parent->id; return id; } /* * Get the perf_event_context for a task and lock it. * * This has to cope with the fact that until it is locked, * the context could get moved to another task. 
*/ static struct perf_event_context * perf_lock_task_context(struct task_struct *task, unsigned long *flags) { struct perf_event_context *ctx; retry: /* * One of the few rules of preemptible RCU is that one cannot do * rcu_read_unlock() while holding a scheduler (or nested) lock when * part of the read side critical section was irqs-enabled -- see * rcu_read_unlock_special(). * * Since ctx->lock nests under rq->lock we must ensure the entire read * side critical section has interrupts disabled. */ local_irq_save(*flags); rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp); if (ctx) { /* * If this context is a clone of another, it might * get swapped for another underneath us by * perf_event_task_sched_out, though the * rcu_read_lock() protects us from any context * getting freed. Lock the context and check if it * got swapped before we could get the lock, and retry * if so. If we locked the right context, then it * can't get swapped on us any more. */ raw_spin_lock(&ctx->lock); if (ctx != rcu_dereference(task->perf_event_ctxp)) { raw_spin_unlock(&ctx->lock); rcu_read_unlock(); local_irq_restore(*flags); goto retry; } if (ctx->task == TASK_TOMBSTONE || !refcount_inc_not_zero(&ctx->refcount)) { raw_spin_unlock(&ctx->lock); ctx = NULL; } else { WARN_ON_ONCE(ctx->task != task); } } rcu_read_unlock(); if (!ctx) local_irq_restore(*flags); return ctx; } /* * Get the context for a task and increment its pin_count so it * can't get swapped to another task. This also increments its * reference count so that the context can't get freed. */ static struct perf_event_context * perf_pin_task_context(struct task_struct *task) { struct perf_event_context *ctx; unsigned long flags; ctx = perf_lock_task_context(task, &flags); if (ctx) { ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); } return ctx; } static void perf_unpin_context(struct perf_event_context *ctx) { unsigned long flags; raw_spin_lock_irqsave(&ctx->lock, flags); --ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); } /* * Update the record of the current time in a context. */ static void __update_context_time(struct perf_event_context *ctx, bool adv) { u64 now = perf_clock(); lockdep_assert_held(&ctx->lock); if (adv) ctx->time += now - ctx->timestamp; ctx->timestamp = now; /* * The above: time' = time + (now - timestamp), can be re-arranged * into: time` = now + (time - timestamp), which gives a single value * offset to compute future time without locks on. * * See perf_event_time_now(), which can be used from NMI context where * it's (obviously) not possible to acquire ctx->lock in order to read * both the above values in a consistent manner. 
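/*
 * Tiny standalone demonstration (arbitrary example values) of the algebra in
 * the comment above: advancing time by (now - timestamp) is the same as
 * adding the precomputed offset (time - timestamp) to 'now', which is what
 * the lockless perf_event_time_now() path relies on.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t time = 500, timestamp = 1200, now = 1700;      /* arbitrary ns */
        uint64_t timeoffset = time - timestamp;         /* may wrap; that is fine */

        uint64_t locked = time + (now - timestamp);     /* update_context_time() */
        uint64_t lockless = now + timeoffset;           /* perf_event_time_now() */

        assert(locked == lockless);
        printf("both paths yield %llu\n", (unsigned long long)locked);
        return 0;
}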
*/ WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp); } static void update_context_time(struct perf_event_context *ctx) { __update_context_time(ctx, true); } static u64 perf_event_time(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; if (unlikely(!ctx)) return 0; if (is_cgroup_event(event)) return perf_cgroup_event_time(event); return ctx->time; } static u64 perf_event_time_now(struct perf_event *event, u64 now) { struct perf_event_context *ctx = event->ctx; if (unlikely(!ctx)) return 0; if (is_cgroup_event(event)) return perf_cgroup_event_time_now(event, now); if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) return ctx->time; now += READ_ONCE(ctx->timeoffset); return now; } static enum event_type_t get_event_type(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; enum event_type_t event_type; lockdep_assert_held(&ctx->lock); /* * It's 'group type', really, because if our group leader is * pinned, so are we. */ if (event->group_leader != event) event = event->group_leader; event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE; if (!ctx->task) event_type |= EVENT_CPU; return event_type; } /* * Helper function to initialize event group nodes. */ static void init_event_group(struct perf_event *event) { RB_CLEAR_NODE(&event->group_node); event->group_index = 0; } /* * Extract pinned or flexible groups from the context * based on event attrs bits. */ static struct perf_event_groups * get_event_groups(struct perf_event *event, struct perf_event_context *ctx) { if (event->attr.pinned) return &ctx->pinned_groups; else return &ctx->flexible_groups; } /* * Helper function to initializes perf_event_group trees. */ static void perf_event_groups_init(struct perf_event_groups *groups) { groups->tree = RB_ROOT; groups->index = 0; } static inline struct cgroup *event_cgroup(const struct perf_event *event) { struct cgroup *cgroup = NULL; #ifdef CONFIG_CGROUP_PERF if (event->cgrp) cgroup = event->cgrp->css.cgroup; #endif return cgroup; } /* * Compare function for event groups; * * Implements complex key that first sorts by CPU and then by virtual index * which provides ordering when rotating groups for the same CPU. */ static __always_inline int perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu, const struct cgroup *left_cgroup, const u64 left_group_index, const struct perf_event *right) { if (left_cpu < right->cpu) return -1; if (left_cpu > right->cpu) return 1; if (left_pmu) { if (left_pmu < right->pmu_ctx->pmu) return -1; if (left_pmu > right->pmu_ctx->pmu) return 1; } #ifdef CONFIG_CGROUP_PERF { const struct cgroup *right_cgroup = event_cgroup(right); if (left_cgroup != right_cgroup) { if (!left_cgroup) { /* * Left has no cgroup but right does, no * cgroups come first. */ return -1; } if (!right_cgroup) { /* * Right has no cgroup but left does, no * cgroups come first. */ return 1; } /* Two dissimilar cgroups, order by id. 
*/ if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup)) return -1; return 1; } } #endif if (left_group_index < right->group_index) return -1; if (left_group_index > right->group_index) return 1; return 0; } #define __node_2_pe(node) \ rb_entry((node), struct perf_event, group_node) static inline bool __group_less(struct rb_node *a, const struct rb_node *b) { struct perf_event *e = __node_2_pe(a); return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e), e->group_index, __node_2_pe(b)) < 0; } struct __group_key { int cpu; struct pmu *pmu; struct cgroup *cgroup; }; static inline int __group_cmp(const void *key, const struct rb_node *node) { const struct __group_key *a = key; const struct perf_event *b = __node_2_pe(node); /* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */ return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b); } static inline int __group_cmp_ignore_cgroup(const void *key, const struct rb_node *node) { const struct __group_key *a = key; const struct perf_event *b = __node_2_pe(node); /* partial/subtree match: @cpu, @pmu, ignore: @cgroup, @group_index */ return perf_event_groups_cmp(a->cpu, a->pmu, event_cgroup(b), b->group_index, b); } /* * Insert @event into @groups' tree; using * {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index} * as key. This places it last inside the {cpu,pmu,cgroup} subtree. */ static void perf_event_groups_insert(struct perf_event_groups *groups, struct perf_event *event) { event->group_index = ++groups->index; rb_add(&event->group_node, &groups->tree, __group_less); } /* * Helper function to insert event into the pinned or flexible groups. */ static void add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_groups *groups; groups = get_event_groups(event, ctx); perf_event_groups_insert(groups, event); } /* * Delete a group from a tree. */ static void perf_event_groups_delete(struct perf_event_groups *groups, struct perf_event *event) { WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) || RB_EMPTY_ROOT(&groups->tree)); rb_erase(&event->group_node, &groups->tree); init_event_group(event); } /* * Helper function to delete event from its groups. */ static void del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_groups *groups; groups = get_event_groups(event, ctx); perf_event_groups_delete(groups, event); } /* * Get the leftmost event in the {cpu,pmu,cgroup} subtree. */ static struct perf_event * perf_event_groups_first(struct perf_event_groups *groups, int cpu, struct pmu *pmu, struct cgroup *cgrp) { struct __group_key key = { .cpu = cpu, .pmu = pmu, .cgroup = cgrp, }; struct rb_node *node; node = rb_find_first(&key, &groups->tree, __group_cmp); if (node) return __node_2_pe(node); return NULL; } static struct perf_event * perf_event_groups_next(struct perf_event *event, struct pmu *pmu) { struct __group_key key = { .cpu = event->cpu, .pmu = pmu, .cgroup = event_cgroup(event), }; struct rb_node *next; next = rb_next_match(&key, &event->group_node, __group_cmp); if (next) return __node_2_pe(next); return NULL; } #define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) \ for (event = perf_event_groups_first(groups, cpu, pmu, NULL); \ event; event = perf_event_groups_next(event, pmu)) /* * Iterate through the whole groups tree. 
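/*
 * Userspace sketch (plain qsort instead of the kernel rbtree, struct and
 * names are ours) of the ordering idea behind perf_event_groups_cmp(): a
 * composite key that sorts first by CPU and then by a monotonically growing
 * insertion index, so within one CPU the most recently inserted group always
 * comes last and rotation can simply re-insert a group to move it to the end.
 */
#include <stdio.h>
#include <stdlib.h>

struct fake_event { int cpu; unsigned long long group_index; };

static int groups_cmp(const void *pa, const void *pb)
{
        const struct fake_event *a = pa, *b = pb;

        if (a->cpu != b->cpu)
                return a->cpu < b->cpu ? -1 : 1;
        if (a->group_index != b->group_index)
                return a->group_index < b->group_index ? -1 : 1;
        return 0;
}

int main(void)
{
        struct fake_event ev[] = {
                { 1, 3 }, { 0, 4 }, { 1, 1 }, { 0, 2 }, { -1, 5 },
        };
        size_t n = sizeof(ev) / sizeof(ev[0]);

        qsort(ev, n, sizeof(ev[0]), groups_cmp);
        for (size_t i = 0; i < n; i++)
                printf("cpu=%2d group_index=%llu\n", ev[i].cpu, ev[i].group_index);
        return 0;
}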
*/ #define perf_event_groups_for_each(event, groups) \ for (event = rb_entry_safe(rb_first(&((groups)->tree)), \ typeof(*event), group_node); event; \ event = rb_entry_safe(rb_next(&event->group_node), \ typeof(*event), group_node)) /* * Does the event attribute request inherit with PERF_SAMPLE_READ */ static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr) { return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ); } /* * Add an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { lockdep_assert_held(&ctx->lock); WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); event->attach_state |= PERF_ATTACH_CONTEXT; event->tstamp = perf_event_time(event); /* * If we're a stand alone event or group leader, we go to the context * list, group events are kept attached to the group so that * perf_group_detach can, at all times, locate all siblings. */ if (event->group_leader == event) { event->group_caps = event->event_caps; add_event_to_groups(event, ctx); } list_add_rcu(&event->event_entry, &ctx->event_list); ctx->nr_events++; if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT) ctx->nr_user++; if (event->attr.inherit_stat) ctx->nr_stat++; if (has_inherit_and_sample_read(&event->attr)) local_inc(&ctx->nr_no_switch_fast); if (event->state > PERF_EVENT_STATE_OFF) perf_cgroup_event_enable(event, ctx); ctx->generation++; event->pmu_ctx->nr_events++; } /* * Initialize event state based on the perf_event_attr::disabled. */ static inline void perf_event__state_init(struct perf_event *event) { event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF : PERF_EVENT_STATE_INACTIVE; } static int __perf_event_read_size(u64 read_format, int nr_siblings) { int entry = sizeof(u64); /* value */ int size = 0; int nr = 1; if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) size += sizeof(u64); if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) size += sizeof(u64); if (read_format & PERF_FORMAT_ID) entry += sizeof(u64); if (read_format & PERF_FORMAT_LOST) entry += sizeof(u64); if (read_format & PERF_FORMAT_GROUP) { nr += nr_siblings; size += sizeof(u64); } /* * Since perf_event_validate_size() limits this to 16k and inhibits * adding more siblings, this will never overflow. */ return size + nr * entry; } static void __perf_event_header_size(struct perf_event *event, u64 sample_type) { struct perf_sample_data *data; u16 size = 0; if (sample_type & PERF_SAMPLE_IP) size += sizeof(data->ip); if (sample_type & PERF_SAMPLE_ADDR) size += sizeof(data->addr); if (sample_type & PERF_SAMPLE_PERIOD) size += sizeof(data->period); if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) size += sizeof(data->weight.full); if (sample_type & PERF_SAMPLE_READ) size += event->read_size; if (sample_type & PERF_SAMPLE_DATA_SRC) size += sizeof(data->data_src.val); if (sample_type & PERF_SAMPLE_TRANSACTION) size += sizeof(data->txn); if (sample_type & PERF_SAMPLE_PHYS_ADDR) size += sizeof(data->phys_addr); if (sample_type & PERF_SAMPLE_CGROUP) size += sizeof(data->cgroup); if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) size += sizeof(data->data_page_size); if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) size += sizeof(data->code_page_size); event->header_size = size; } /* * Called at perf_event creation and when events are attached/detached from a * group. 
*/ static void perf_event__header_size(struct perf_event *event) { event->read_size = __perf_event_read_size(event->attr.read_format, event->group_leader->nr_siblings); __perf_event_header_size(event, event->attr.sample_type); } static void perf_event__id_header_size(struct perf_event *event) { struct perf_sample_data *data; u64 sample_type = event->attr.sample_type; u16 size = 0; if (sample_type & PERF_SAMPLE_TID) size += sizeof(data->tid_entry); if (sample_type & PERF_SAMPLE_TIME) size += sizeof(data->time); if (sample_type & PERF_SAMPLE_IDENTIFIER) size += sizeof(data->id); if (sample_type & PERF_SAMPLE_ID) size += sizeof(data->id); if (sample_type & PERF_SAMPLE_STREAM_ID) size += sizeof(data->stream_id); if (sample_type & PERF_SAMPLE_CPU) size += sizeof(data->cpu_entry); event->id_header_size = size; } /* * Check that adding an event to the group does not result in anybody * overflowing the 64k event limit imposed by the output buffer. * * Specifically, check that the read_size for the event does not exceed 16k, * read_size being the one term that grows with groups size. Since read_size * depends on per-event read_format, also (re)check the existing events. * * This leaves 48k for the constant size fields and things like callchains, * branch stacks and register sets. */ static bool perf_event_validate_size(struct perf_event *event) { struct perf_event *sibling, *group_leader = event->group_leader; if (__perf_event_read_size(event->attr.read_format, group_leader->nr_siblings + 1) > 16*1024) return false; if (__perf_event_read_size(group_leader->attr.read_format, group_leader->nr_siblings + 1) > 16*1024) return false; /* * When creating a new group leader, group_leader->ctx is initialized * after the size has been validated, but we cannot safely use * for_each_sibling_event() until group_leader->ctx is set. A new group * leader cannot have any siblings yet, so we can safely skip checking * the non-existent siblings. */ if (event == group_leader) return true; for_each_sibling_event(sibling, group_leader) { if (__perf_event_read_size(sibling->attr.read_format, group_leader->nr_siblings + 1) > 16*1024) return false; } return true; } static void perf_group_attach(struct perf_event *event) { struct perf_event *group_leader = event->group_leader, *pos; lockdep_assert_held(&event->ctx->lock); /* * We can have double attach due to group movement (move_group) in * perf_event_open(). */ if (event->attach_state & PERF_ATTACH_GROUP) return; event->attach_state |= PERF_ATTACH_GROUP; if (group_leader == event) return; WARN_ON_ONCE(group_leader->ctx != event->ctx); group_leader->group_caps &= event->event_caps; list_add_tail(&event->sibling_list, &group_leader->sibling_list); group_leader->nr_siblings++; group_leader->group_generation++; perf_event__header_size(group_leader); for_each_sibling_event(pos, group_leader) perf_event__header_size(pos); } /* * Remove an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); /* * We can have double detach due to exit/hot-unplug + close. 
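/*
 * Userspace mirror (a sketch; the function name is ours) of the
 * __perf_event_read_size() arithmetic above, handy for sizing the buffer
 * passed to read() on a group leader opened with PERF_FORMAT_GROUP. The same
 * formula is what perf_event_validate_size() caps at 16k.
 */
#include <stdio.h>
#include <linux/perf_event.h>

static int read_buffer_size(unsigned long long read_format, int nr_siblings)
{
        int entry = sizeof(unsigned long long);         /* value */
        int size = 0;
        int nr = 1;

        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                size += sizeof(unsigned long long);
        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                size += sizeof(unsigned long long);
        if (read_format & PERF_FORMAT_ID)
                entry += sizeof(unsigned long long);
        if (read_format & PERF_FORMAT_LOST)
                entry += sizeof(unsigned long long);
        if (read_format & PERF_FORMAT_GROUP) {
                nr += nr_siblings;
                size += sizeof(unsigned long long);     /* the 'nr' field */
        }
        return size + nr * entry;
}

int main(void)
{
        unsigned long long rf = PERF_FORMAT_GROUP | PERF_FORMAT_ID |
                                PERF_FORMAT_TOTAL_TIME_ENABLED |
                                PERF_FORMAT_TOTAL_TIME_RUNNING;

        /* leader + 3 siblings: 3 x u64 header + 4 x {value,id} entries = 88 */
        printf("read() needs %d bytes\n", read_buffer_size(rf, 3));
        return 0;
}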
*/ if (!(event->attach_state & PERF_ATTACH_CONTEXT)) return; event->attach_state &= ~PERF_ATTACH_CONTEXT; ctx->nr_events--; if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT) ctx->nr_user--; if (event->attr.inherit_stat) ctx->nr_stat--; if (has_inherit_and_sample_read(&event->attr)) local_dec(&ctx->nr_no_switch_fast); list_del_rcu(&event->event_entry); if (event->group_leader == event) del_event_from_groups(event, ctx); /* * If event was in error state, then keep it * that way, otherwise bogus counts will be * returned on read(). The only way to get out * of error state is by explicit re-enabling * of the event */ if (event->state > PERF_EVENT_STATE_OFF) { perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_OFF); } ctx->generation++; event->pmu_ctx->nr_events--; } static int perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event) { if (!has_aux(aux_event)) return 0; if (!event->pmu->aux_output_match) return 0; return event->pmu->aux_output_match(aux_event); } static void put_event(struct perf_event *event); static void event_sched_out(struct perf_event *event, struct perf_event_context *ctx); static void perf_put_aux_event(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct perf_event *iter; /* * If event uses aux_event tear down the link */ if (event->aux_event) { iter = event->aux_event; event->aux_event = NULL; put_event(iter); return; } /* * If the event is an aux_event, tear down all links to * it from other events. */ for_each_sibling_event(iter, event->group_leader) { if (iter->aux_event != event) continue; iter->aux_event = NULL; put_event(event); /* * If it's ACTIVE, schedule it out and put it into ERROR * state so that we don't try to schedule it again. Note * that perf_event_enable() will clear the ERROR status. */ event_sched_out(iter, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } } static bool perf_need_aux_event(struct perf_event *event) { return event->attr.aux_output || has_aux_action(event); } static int perf_get_aux_event(struct perf_event *event, struct perf_event *group_leader) { /* * Our group leader must be an aux event if we want to be * an aux_output. This way, the aux event will precede its * aux_output events in the group, and therefore will always * schedule first. */ if (!group_leader) return 0; /* * aux_output and aux_sample_size are mutually exclusive. */ if (event->attr.aux_output && event->attr.aux_sample_size) return 0; if (event->attr.aux_output && !perf_aux_output_match(event, group_leader)) return 0; if ((event->attr.aux_pause || event->attr.aux_resume) && !(group_leader->pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) return 0; if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux) return 0; if (!atomic_long_inc_not_zero(&group_leader->refcount)) return 0; /* * Link aux_outputs to their aux event; this is undone in * perf_group_detach() by perf_put_aux_event(). When the * group in torn down, the aux_output events loose their * link to the aux_event and can't schedule any more. */ event->aux_event = group_leader; return 1; } static inline struct list_head *get_event_list(struct perf_event *event) { return event->attr.pinned ? &event->pmu_ctx->pinned_active : &event->pmu_ctx->flexible_active; } /* * Events that have PERF_EV_CAP_SIBLING require being part of a group and * cannot exist on their own, schedule them out and move them into the ERROR * state. Also see _perf_event_enable(), it will not be able to recover * this ERROR state. 
*/ static inline void perf_remove_sibling_event(struct perf_event *event) { event_sched_out(event, event->ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } static void perf_group_detach(struct perf_event *event) { struct perf_event *leader = event->group_leader; struct perf_event *sibling, *tmp; struct perf_event_context *ctx = event->ctx; lockdep_assert_held(&ctx->lock); /* * We can have double detach due to exit/hot-unplug + close. */ if (!(event->attach_state & PERF_ATTACH_GROUP)) return; event->attach_state &= ~PERF_ATTACH_GROUP; perf_put_aux_event(event); /* * If this is a sibling, remove it from its group. */ if (leader != event) { list_del_init(&event->sibling_list); event->group_leader->nr_siblings--; event->group_leader->group_generation++; goto out; } /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them * to whatever list we are on. */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { if (sibling->event_caps & PERF_EV_CAP_SIBLING) perf_remove_sibling_event(sibling); sibling->group_leader = sibling; list_del_init(&sibling->sibling_list); /* Inherit group flags from the previous leader */ sibling->group_caps = event->group_caps; if (sibling->attach_state & PERF_ATTACH_CONTEXT) { add_event_to_groups(sibling, event->ctx); if (sibling->state == PERF_EVENT_STATE_ACTIVE) list_add_tail(&sibling->active_list, get_event_list(sibling)); } WARN_ON_ONCE(sibling->ctx != event->ctx); } out: for_each_sibling_event(tmp, leader) perf_event__header_size(tmp); perf_event__header_size(leader); } static void sync_child_event(struct perf_event *child_event); static void perf_child_detach(struct perf_event *event) { struct perf_event *parent_event = event->parent; if (!(event->attach_state & PERF_ATTACH_CHILD)) return; event->attach_state &= ~PERF_ATTACH_CHILD; if (WARN_ON_ONCE(!parent_event)) return; lockdep_assert_held(&parent_event->child_mutex); sync_child_event(event); list_del_init(&event->child_list); } static bool is_orphaned_event(struct perf_event *event) { return event->state == PERF_EVENT_STATE_DEAD; } static inline int event_filter_match(struct perf_event *event) { return (event->cpu == -1 || event->cpu == smp_processor_id()) && perf_cgroup_match(event); } static void event_sched_out(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_pmu_context *epc = event->pmu_ctx; struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); enum perf_event_state state = PERF_EVENT_STATE_INACTIVE; // XXX cpc serialization, probably per-cpu IRQ disabled WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); if (event->state != PERF_EVENT_STATE_ACTIVE) return; /* * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but * we can schedule events _OUT_ individually through things like * __perf_remove_from_context(). 
*/ list_del_init(&event->active_list); perf_pmu_disable(event->pmu); event->pmu->del(event, 0); event->oncpu = -1; if (event->pending_disable) { event->pending_disable = 0; perf_cgroup_event_disable(event, ctx); state = PERF_EVENT_STATE_OFF; } perf_event_set_state(event, state); if (!is_software_event(event)) cpc->active_oncpu--; if (event->attr.freq && event->attr.sample_freq) { ctx->nr_freq--; epc->nr_freq--; } if (event->attr.exclusive || !cpc->active_oncpu) cpc->exclusive = 0; perf_pmu_enable(event->pmu); } static void group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event; if (group_event->state != PERF_EVENT_STATE_ACTIVE) return; perf_assert_pmu_disabled(group_event->pmu_ctx->pmu); event_sched_out(group_event, ctx); /* * Schedule out siblings (if any): */ for_each_sibling_event(event, group_event) event_sched_out(event, ctx); } static inline void __ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final) { if (ctx->is_active & EVENT_TIME) { if (ctx->is_active & EVENT_FROZEN) return; update_context_time(ctx); update_cgrp_time_from_cpuctx(cpuctx, final); } } static inline void ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { __ctx_time_update(cpuctx, ctx, false); } /* * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock(). */ static inline void ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { ctx_time_update(cpuctx, ctx); if (ctx->is_active & EVENT_TIME) ctx->is_active |= EVENT_FROZEN; } static inline void ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event) { if (ctx->is_active & EVENT_TIME) { if (ctx->is_active & EVENT_FROZEN) return; update_context_time(ctx); update_cgrp_time_from_event(event); } } #define DETACH_GROUP 0x01UL #define DETACH_CHILD 0x02UL #define DETACH_DEAD 0x04UL /* * Cross CPU call to remove a performance event * * We disable the event on the hardware level first. After that we * remove it from the context list. */ static void __perf_remove_from_context(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx; unsigned long flags = (unsigned long)info; ctx_time_update(cpuctx, ctx); /* * Ensure event_sched_out() switches to OFF, at the very least * this avoids raising perf_pending_task() at this time. */ if (flags & DETACH_DEAD) event->pending_disable = 1; event_sched_out(event, ctx); if (flags & DETACH_GROUP) perf_group_detach(event); if (flags & DETACH_CHILD) perf_child_detach(event); list_del_event(event, ctx); if (flags & DETACH_DEAD) event->state = PERF_EVENT_STATE_DEAD; if (!pmu_ctx->nr_events) { pmu_ctx->rotate_necessary = 0; if (ctx->task && ctx->is_active) { struct perf_cpu_pmu_context *cpc; cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = NULL; } } if (!ctx->nr_events && ctx->is_active) { if (ctx == &cpuctx->ctx) update_cgrp_time_from_cpuctx(cpuctx, true); ctx->is_active = 0; if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); cpuctx->task_ctx = NULL; } } } /* * Remove the event from a task's (or a CPU's) list of events. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. 
This is OK when called from perf_release since * that only calls us on the top-level context, which can't be a clone. * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ static void perf_remove_from_context(struct perf_event *event, unsigned long flags) { struct perf_event_context *ctx = event->ctx; lockdep_assert_held(&ctx->mutex); /* * Because of perf_event_exit_task(), perf_remove_from_context() ought * to work in the face of TASK_TOMBSTONE, unlike every other * event_function_call() user. */ raw_spin_lock_irq(&ctx->lock); if (!ctx->is_active) { __perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context), ctx, (void *)flags); raw_spin_unlock_irq(&ctx->lock); return; } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_remove_from_context, (void *)flags); } /* * Cross CPU call to disable a performance event */ static void __perf_event_disable(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { if (event->state < PERF_EVENT_STATE_INACTIVE) return; perf_pmu_disable(event->pmu_ctx->pmu); ctx_time_update_event(ctx, event); if (event == event->group_leader) group_sched_out(event, ctx); else event_sched_out(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_OFF); perf_cgroup_event_disable(event, ctx); perf_pmu_enable(event->pmu_ctx->pmu); } /* * Disable an event. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This condition is satisfied when called through * perf_event_for_each_child or perf_event_for_each because they * hold the top-level event's child_mutex, so any descendant that * goes to exit will block in perf_event_exit_event(). * * When called from perf_pending_disable it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. */ static void _perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; raw_spin_lock_irq(&ctx->lock); if (event->state <= PERF_EVENT_STATE_OFF) { raw_spin_unlock_irq(&ctx->lock); return; } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_event_disable, NULL); } void perf_event_disable_local(struct perf_event *event) { event_function_local(event, __perf_event_disable, NULL); } /* * Strictly speaking kernel users cannot create groups and therefore this * interface does not need the perf_event_ctx_lock() magic. 
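/*
 * Userspace view (illustrative sketch) of the disable path above and the
 * enable path further below: PERF_EVENT_IOC_DISABLE ends up in
 * _perf_event_disable() and PERF_EVENT_IOC_ENABLE in the enable path. Event
 * choice and the commented-out workload are placeholders.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
        struct perf_event_attr attr;
        unsigned long long count;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_HARDWARE;
        attr.config = PERF_COUNT_HW_INSTRUCTIONS;
        attr.disabled = 1;              /* starts in PERF_EVENT_STATE_OFF */
        attr.exclude_kernel = 1;
        attr.exclude_hv = 1;

        fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        ioctl(fd, PERF_EVENT_IOC_RESET, 0);
        ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);    /* OFF -> INACTIVE/ACTIVE */
        /* ... region of interest would run here ... */
        ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);   /* back to OFF */

        if (read(fd, &count, sizeof(count)) == sizeof(count))
                printf("instructions while enabled: %llu\n", count);
        close(fd);
        return 0;
}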
*/ void perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx; ctx = perf_event_ctx_lock(event); _perf_event_disable(event); perf_event_ctx_unlock(event, ctx); } EXPORT_SYMBOL_GPL(perf_event_disable); void perf_event_disable_inatomic(struct perf_event *event) { event->pending_disable = 1; irq_work_queue(&event->pending_disable_irq); } #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_event *event, int enable); static void perf_log_itrace_start(struct perf_event *event); static int event_sched_in(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_pmu_context *epc = event->pmu_ctx; struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); int ret = 0; WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); if (event->state <= PERF_EVENT_STATE_OFF) return 0; WRITE_ONCE(event->oncpu, smp_processor_id()); /* * Order event::oncpu write to happen before the ACTIVE state is * visible. This allows perf_event_{stop,read}() to observe the correct * ->oncpu if it sees ACTIVE. */ smp_wmb(); perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE); /* * Unthrottle events, since we scheduled we might have missed several * ticks already, also for a heavily scheduling task there is little * guarantee it'll get a tick in a timely manner. */ if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { perf_log_throttle(event, 1); event->hw.interrupts = 0; } perf_pmu_disable(event->pmu); perf_log_itrace_start(event); if (event->pmu->add(event, PERF_EF_START)) { perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); event->oncpu = -1; ret = -EAGAIN; goto out; } if (!is_software_event(event)) cpc->active_oncpu++; if (event->attr.freq && event->attr.sample_freq) { ctx->nr_freq++; epc->nr_freq++; } if (event->attr.exclusive) cpc->exclusive = 1; out: perf_pmu_enable(event->pmu); return ret; } static int group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event, *partial_group = NULL; struct pmu *pmu = group_event->pmu_ctx->pmu; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; pmu->start_txn(pmu, PERF_PMU_TXN_ADD); if (event_sched_in(group_event, ctx)) goto error; /* * Schedule in siblings as one group (if any): */ for_each_sibling_event(event, group_event) { if (event_sched_in(event, ctx)) { partial_group = event; goto group_error; } } if (!pmu->commit_txn(pmu)) return 0; group_error: /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: * The events up to the failed event are scheduled out normally. */ for_each_sibling_event(event, group_event) { if (event == partial_group) break; event_sched_out(event, ctx); } event_sched_out(group_event, ctx); error: pmu->cancel_txn(pmu); return -EAGAIN; } /* * Work out whether we can put this event group on the CPU now. */ static int group_can_go_on(struct perf_event *event, int can_add_hw) { struct perf_event_pmu_context *epc = event->pmu_ctx; struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); /* * Groups consisting entirely of software events can always go on. */ if (event->group_caps & PERF_EV_CAP_SOFTWARE) return 1; /* * If an exclusive group is already on, no other hardware * events can go on. */ if (cpc->exclusive) return 0; /* * If this group is exclusive and there are already * events on the CPU, it can't go on. 
*/ if (event->attr.exclusive && !list_empty(get_event_list(event))) return 0; /* * Otherwise, try to add it if all previous groups were able * to go on. */ return can_add_hw; } static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { list_add_event(event, ctx); perf_group_attach(event); } static void task_ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); if (!cpuctx->task_ctx) return; if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; ctx_sched_out(ctx, pmu, event_type); } static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, struct pmu *pmu) { ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED); if (ctx) ctx_sched_in(ctx, pmu, EVENT_PINNED); ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); if (ctx) ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE); } /* * We want to maintain the following priority of scheduling: * - CPU pinned (EVENT_CPU | EVENT_PINNED) * - task pinned (EVENT_PINNED) * - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE) * - task flexible (EVENT_FLEXIBLE). * * In order to avoid unscheduling and scheduling back in everything every * time an event is added, only do it for the groups of equal priority and * below. * * This can be called after a batch operation on task events, in which case * event_type is a bit mask of the types of events involved. For CPU events, * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE. */ static void ctx_resched(struct perf_cpu_context *cpuctx, struct perf_event_context *task_ctx, struct pmu *pmu, enum event_type_t event_type) { bool cpu_event = !!(event_type & EVENT_CPU); struct perf_event_pmu_context *epc; /* * If pinned groups are involved, flexible groups also need to be * scheduled out. */ if (event_type & EVENT_PINNED) event_type |= EVENT_FLEXIBLE; event_type &= EVENT_ALL; for_each_epc(epc, &cpuctx->ctx, pmu, false) perf_pmu_disable(epc->pmu); if (task_ctx) { for_each_epc(epc, task_ctx, pmu, false) perf_pmu_disable(epc->pmu); task_ctx_sched_out(task_ctx, pmu, event_type); } /* * Decide which cpu ctx groups to schedule out based on the types * of events that caused rescheduling: * - EVENT_CPU: schedule out corresponding groups; * - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups; * - otherwise, do nothing more. */ if (cpu_event) ctx_sched_out(&cpuctx->ctx, pmu, event_type); else if (event_type & EVENT_PINNED) ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, task_ctx, pmu); for_each_epc(epc, &cpuctx->ctx, pmu, false) perf_pmu_enable(epc->pmu); if (task_ctx) { for_each_epc(epc, task_ctx, pmu, false) perf_pmu_enable(epc->pmu); } } void perf_pmu_resched(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; perf_ctx_lock(cpuctx, task_ctx); ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU); perf_ctx_unlock(cpuctx, task_ctx); } /* * Cross CPU call to install and enable a performance event * * Very similar to remote_function() + event_function() but cannot assume that * things like ctx->is_active and cpuctx->task_ctx are set. 
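/*
 * Userspace-visible consequence (hedged sketch) of the pinned-first
 * scheduling order described above: a pinned event that cannot get onto the
 * PMU is moved to the ERROR state, and the usual symptom is read() returning
 * 0 bytes on it. The event choice below is arbitrary.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
        struct perf_event_attr attr;
        unsigned long long count;
        ssize_t n;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_HARDWARE;
        attr.config = PERF_COUNT_HW_CPU_CYCLES;
        attr.pinned = 1;                /* EVENT_PINNED: scheduled before flexible groups */
        attr.exclude_kernel = 1;
        attr.exclude_hv = 1;

        fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        n = read(fd, &count, sizeof(count));
        if (n == 0)
                printf("pinned event could not stay scheduled (ERROR state)\n");
        else if (n == sizeof(count))
                printf("cycles: %llu\n", count);
        close(fd);
        return 0;
}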
*/ static int __perf_install_in_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; bool reprogram = true; int ret = 0; raw_spin_lock(&cpuctx->ctx.lock); if (ctx->task) { raw_spin_lock(&ctx->lock); task_ctx = ctx; reprogram = (ctx->task == current); /* * If the task is running, it must be running on this CPU, * otherwise we cannot reprogram things. * * If its not running, we don't care, ctx->lock will * serialize against it becoming runnable. */ if (task_curr(ctx->task) && !reprogram) { ret = -ESRCH; goto unlock; } WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx); } else if (task_ctx) { raw_spin_lock(&task_ctx->lock); } #ifdef CONFIG_CGROUP_PERF if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) { /* * If the current cgroup doesn't match the event's * cgroup, we should not try to schedule it. */ struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); reprogram = cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup); } #endif if (reprogram) { ctx_time_freeze(cpuctx, ctx); add_event_to_ctx(event, ctx); ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event)); } else { add_event_to_ctx(event, ctx); } unlock: perf_ctx_unlock(cpuctx, task_ctx); return ret; } static bool exclusive_event_installable(struct perf_event *event, struct perf_event_context *ctx); /* * Attach a performance event to a context. * * Very similar to event_function_call, see comment there. */ static void perf_install_in_context(struct perf_event_context *ctx, struct perf_event *event, int cpu) { struct task_struct *task = READ_ONCE(ctx->task); lockdep_assert_held(&ctx->mutex); WARN_ON_ONCE(!exclusive_event_installable(event, ctx)); if (event->cpu != -1) WARN_ON_ONCE(event->cpu != cpu); /* * Ensures that if we can observe event->ctx, both the event and ctx * will be 'complete'. See perf_iterate_sb_cpu(). */ smp_store_release(&event->ctx, ctx); /* * perf_event_attr::disabled events will not run and can be initialized * without IPI. Except when this is the first event for the context, in * that case we need the magic of the IPI to set ctx->is_active. * * The IOC_ENABLE that is sure to follow the creation of a disabled * event will issue the IPI and reprogram the hardware. */ if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events && !is_cgroup_event(event)) { raw_spin_lock_irq(&ctx->lock); if (ctx->task == TASK_TOMBSTONE) { raw_spin_unlock_irq(&ctx->lock); return; } add_event_to_ctx(event, ctx); raw_spin_unlock_irq(&ctx->lock); return; } if (!task) { cpu_function_call(cpu, __perf_install_in_context, event); return; } /* * Should not happen, we validate the ctx is still alive before calling. */ if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) return; /* * Installing events is tricky because we cannot rely on ctx->is_active * to be set in case this is the nr_events 0 -> 1 transition. * * Instead we use task_curr(), which tells us if the task is running. * However, since we use task_curr() outside of rq::lock, we can race * against the actual state. This means the result can be wrong. * * If we get a false positive, we retry, this is harmless. * * If we get a false negative, things are complicated. If we are after * perf_event_context_sched_in() ctx::lock will serialize us, and the * value must be correct. 
If we're before, it doesn't matter since * perf_event_context_sched_in() will program the counter. * * However, this hinges on the remote context switch having observed * our task->perf_event_ctxp[] store, such that it will in fact take * ctx::lock in perf_event_context_sched_in(). * * We do this by task_function_call(), if the IPI fails to hit the task * we know any future context switch of task must see the * perf_event_ctpx[] store. */ /* * This smp_mb() orders the task->perf_event_ctxp[] store with the * task_cpu() load, such that if the IPI then does not find the task * running, a future context switch of that task must observe the * store. */ smp_mb(); again: if (!task_function_call(task, __perf_install_in_context, event)) return; raw_spin_lock_irq(&ctx->lock); task = ctx->task; if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) { /* * Cannot happen because we already checked above (which also * cannot happen), and we hold ctx->mutex, which serializes us * against perf_event_exit_task_context(). */ raw_spin_unlock_irq(&ctx->lock); return; } /* * If the task is not running, ctx->lock will avoid it becoming so, * thus we can safely install the event. */ if (task_curr(task)) { raw_spin_unlock_irq(&ctx->lock); goto again; } add_event_to_ctx(event, ctx); raw_spin_unlock_irq(&ctx->lock); } /* * Cross CPU call to enable a performance event */ static void __perf_event_enable(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { struct perf_event *leader = event->group_leader; struct perf_event_context *task_ctx; if (event->state >= PERF_EVENT_STATE_INACTIVE || event->state <= PERF_EVENT_STATE_ERROR) return; ctx_time_freeze(cpuctx, ctx); perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); perf_cgroup_event_enable(event, ctx); if (!ctx->is_active) return; if (!event_filter_match(event)) return; /* * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. */ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) return; task_ctx = cpuctx->task_ctx; if (ctx->task) WARN_ON_ONCE(task_ctx != ctx); ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event)); } /* * Enable an event. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This condition is satisfied when called through * perf_event_for_each_child or perf_event_for_each as described * for perf_event_disable. */ static void _perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; raw_spin_lock_irq(&ctx->lock); if (event->state >= PERF_EVENT_STATE_INACTIVE || event->state < PERF_EVENT_STATE_ERROR) { out: raw_spin_unlock_irq(&ctx->lock); return; } /* * If the event is in error state, clear that first. * * That way, if we see the event in error state below, we know that it * has gone back into error state, as distinct from the task having * been scheduled away before the cross-call arrived. */ if (event->state == PERF_EVENT_STATE_ERROR) { /* * Detached SIBLING events cannot leave ERROR state. 
*/ if (event->event_caps & PERF_EV_CAP_SIBLING && event->group_leader == event) goto out; event->state = PERF_EVENT_STATE_OFF; } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_event_enable, NULL); } /* * See perf_event_disable(); */ void perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx; ctx = perf_event_ctx_lock(event); _perf_event_enable(event); perf_event_ctx_unlock(event, ctx); } EXPORT_SYMBOL_GPL(perf_event_enable); struct stop_event_data { struct perf_event *event; unsigned int restart; }; static int __perf_event_stop(void *info) { struct stop_event_data *sd = info; struct perf_event *event = sd->event; /* if it's already INACTIVE, do nothing */ if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) return 0; /* matches smp_wmb() in event_sched_in() */ smp_rmb(); /* * There is a window with interrupts enabled before we get here, * so we need to check again lest we try to stop another CPU's event. */ if (READ_ONCE(event->oncpu) != smp_processor_id()) return -EAGAIN; event->pmu->stop(event, PERF_EF_UPDATE); /* * May race with the actual stop (through perf_pmu_output_stop()), * but it is only used for events with AUX ring buffer, and such * events will refuse to restart because of rb::aux_mmap_count==0, * see comments in perf_aux_output_begin(). * * Since this is happening on an event-local CPU, no trace is lost * while restarting. */ if (sd->restart) event->pmu->start(event, 0); return 0; } static int perf_event_stop(struct perf_event *event, int restart) { struct stop_event_data sd = { .event = event, .restart = restart, }; int ret = 0; do { if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) return 0; /* matches smp_wmb() in event_sched_in() */ smp_rmb(); /* * We only want to restart ACTIVE events, so if the event goes * inactive here (event->oncpu==-1), there's nothing more to do; * fall through with ret==-ENXIO. */ ret = cpu_function_call(READ_ONCE(event->oncpu), __perf_event_stop, &sd); } while (ret == -EAGAIN); return ret; } /* * In order to contain the amount of racy and tricky in the address filter * configuration management, it is a two part process: * * (p1) when userspace mappings change as a result of (1) or (2) or (3) below, * we update the addresses of corresponding vmas in * event::addr_filter_ranges array and bump the event::addr_filters_gen; * (p2) when an event is scheduled in (pmu::add), it calls * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync() * if the generation has changed since the previous call. * * If (p1) happens while the event is active, we restart it to force (p2). * * (1) perf_addr_filters_apply(): adjusting filters' offsets based on * pre-existing mappings, called once when new filters arrive via SET_FILTER * ioctl; * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly * registered mapping, called for every new mmap(), with mm::mmap_lock down * for reading; * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process * of exec. 
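/*
 * Illustrative sketch, not part of this file: step (1) above is driven from
 * userspace through the SET_FILTER ioctl.  This fragment assumes fd refers to
 * an AUX-area event (e.g. intel_pt) whose PMU supports address filters; the
 * "filter <start>/<size>@<object>" syntax is the one accepted for such PMUs,
 * and the address, size and path values here are made up.
 */
	const char *flt = "filter 0x1000/0x2000@/usr/bin/ls";

	if (ioctl(fd, PERF_EVENT_IOC_SET_FILTER, (char *)flt) < 0)
		perror("PERF_EVENT_IOC_SET_FILTER");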
*/ void perf_event_addr_filters_sync(struct perf_event *event) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); if (!has_addr_filter(event)) return; raw_spin_lock(&ifh->lock); if (event->addr_filters_gen != event->hw.addr_filters_gen) { event->pmu->addr_filters_sync(event); event->hw.addr_filters_gen = event->addr_filters_gen; } raw_spin_unlock(&ifh->lock); } EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync); static int _perf_event_refresh(struct perf_event *event, int refresh) { /* * not supported on inherited events */ if (event->attr.inherit || !is_sampling_event(event)) return -EINVAL; atomic_add(refresh, &event->event_limit); _perf_event_enable(event); return 0; } /* * See perf_event_disable() */ int perf_event_refresh(struct perf_event *event, int refresh) { struct perf_event_context *ctx; int ret; ctx = perf_event_ctx_lock(event); ret = _perf_event_refresh(event, refresh); perf_event_ctx_unlock(event, ctx); return ret; } EXPORT_SYMBOL_GPL(perf_event_refresh); static int perf_event_modify_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { int err; _perf_event_disable(bp); err = modify_user_hw_breakpoint_check(bp, attr, true); if (!bp->attr.disabled) _perf_event_enable(bp); return err; } /* * Copy event-type-independent attributes that may be modified. */ static void perf_event_modify_copy_attr(struct perf_event_attr *to, const struct perf_event_attr *from) { to->sig_data = from->sig_data; } static int perf_event_modify_attr(struct perf_event *event, struct perf_event_attr *attr) { int (*func)(struct perf_event *, struct perf_event_attr *); struct perf_event *child; int err; if (event->attr.type != attr->type) return -EINVAL; switch (event->attr.type) { case PERF_TYPE_BREAKPOINT: func = perf_event_modify_breakpoint; break; default: /* Place holder for future additions. */ return -EOPNOTSUPP; } WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->child_mutex); /* * Event-type-independent attributes must be copied before event-type * modification, which will validate that final attributes match the * source attributes after all relevant attributes have been copied. */ perf_event_modify_copy_attr(&event->attr, attr); err = func(event, attr); if (err) goto out; list_for_each_entry(child, &event->child_list, child_list) { perf_event_modify_copy_attr(&child->attr, attr); err = func(child, attr); if (err) goto out; } out: mutex_unlock(&event->child_mutex); return err; } static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx, enum event_type_t event_type) { struct perf_event_context *ctx = pmu_ctx->ctx; struct perf_event *event, *tmp; struct pmu *pmu = pmu_ctx->pmu; if (ctx->task && !(ctx->is_active & EVENT_ALL)) { struct perf_cpu_pmu_context *cpc; cpc = this_cpu_ptr(pmu->cpu_pmu_context); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = NULL; } if (!(event_type & EVENT_ALL)) return; perf_pmu_disable(pmu); if (event_type & EVENT_PINNED) { list_for_each_entry_safe(event, tmp, &pmu_ctx->pinned_active, active_list) group_sched_out(event, ctx); } if (event_type & EVENT_FLEXIBLE) { list_for_each_entry_safe(event, tmp, &pmu_ctx->flexible_active, active_list) group_sched_out(event, ctx); /* * Since we cleared EVENT_FLEXIBLE, also clear * rotate_necessary, is will be reset by * ctx_flexible_sched_in() when needed. */ pmu_ctx->rotate_necessary = 0; } perf_pmu_enable(pmu); } /* * Be very careful with the @pmu argument since this will change ctx state. 
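/*
 * Illustrative sketch, not part of this file: _perf_event_refresh() above
 * backs the REFRESH ioctl used for overflow-driven self profiling.  The
 * fragment assumes fd is a sampling event created disabled with a
 * sample_period, and that <fcntl.h> and <sys/ioctl.h> are included; after the
 * requested number of overflows the kernel disables the event again and
 * raises SIGIO/POLL_HUP.
 */
	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC);
	fcntl(fd, F_SETOWN, getpid());		/* deliver the overflow signal here */
	ioctl(fd, PERF_EVENT_IOC_REFRESH, 1);	/* arm for one more overflow */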
* The @pmu argument works for ctx_resched(), because that is symmetric in * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant. * * However, if you were to be asymmetrical, you could end up with messed up * state, eg. ctx->is_active cleared even though most EPCs would still actually * be active. */ static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; bool cgroup = event_type & EVENT_CGROUP; event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); if (likely(!ctx->nr_events)) { /* * See __perf_remove_from_context(). */ WARN_ON_ONCE(ctx->is_active); if (ctx->task) WARN_ON_ONCE(cpuctx->task_ctx); return; } /* * Always update time if it was set; not only when it changes. * Otherwise we can 'forget' to update time for any but the last * context we sched out. For example: * * ctx_sched_out(.event_type = EVENT_FLEXIBLE) * ctx_sched_out(.event_type = EVENT_PINNED) * * would only update time for the pinned events. */ __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() */ barrier(); ctx->is_active &= ~event_type; if (!(ctx->is_active & EVENT_ALL)) { /* * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now() * does not observe a hole. perf_ctx_unlock() will clean up. */ if (ctx->is_active & EVENT_FROZEN) ctx->is_active &= EVENT_TIME_FROZEN; else ctx->is_active = 0; } if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); if (!(ctx->is_active & EVENT_ALL)) cpuctx->task_ctx = NULL; } is_active ^= ctx->is_active; /* changed bits */ for_each_epc(pmu_ctx, ctx, pmu, cgroup) __pmu_ctx_sched_out(pmu_ctx, is_active); } /* * Test whether two contexts are equivalent, i.e. whether they have both been * cloned from the same version of the same context. * * Equivalence is measured using a generation number in the context that is * incremented on each modification to it; see unclone_ctx(), list_add_event() * and list_del_event(). */ static int context_equiv(struct perf_event_context *ctx1, struct perf_event_context *ctx2) { lockdep_assert_held(&ctx1->lock); lockdep_assert_held(&ctx2->lock); /* Pinning disables the swap optimization */ if (ctx1->pin_count || ctx2->pin_count) return 0; /* If ctx1 is the parent of ctx2 */ if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen) return 1; /* If ctx2 is the parent of ctx1 */ if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation) return 1; /* * If ctx1 and ctx2 have the same parent; we flatten the parent * hierarchy, see perf_event_init_context(). */ if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx && ctx1->parent_gen == ctx2->parent_gen) return 1; /* Unmatched */ return 0; } static void __perf_event_sync_stat(struct perf_event *event, struct perf_event *next_event) { u64 value; if (!event->attr.inherit_stat) return; /* * Update the event value, we cannot use perf_event_read() * because we're in the middle of a context switch and have IRQs * disabled, which upsets smp_call_function_single(), however * we know the event must be on the current CPU, therefore we * don't need to use it. */ if (event->state == PERF_EVENT_STATE_ACTIVE) event->pmu->read(event); perf_event_update_time(event); /* * In order to keep per-task stats reliable we need to flip the event * values when we flip the contexts. 
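/*
 * Illustrative sketch, not part of this file: a standalone reduction of the
 * generation-number test performed by context_equiv() above.  The struct and
 * function names are hypothetical; the point is that every modification bumps
 * ->generation, a clone records the parent's generation at clone time, and
 * the three comparisons below therefore prove neither side changed since
 * cloning.
 */
#include <stdbool.h>
#include <stdint.h>

struct ctx {
	struct ctx *parent;	/* context this one was cloned from, or NULL */
	uint64_t generation;	/* bumped on every add/remove */
	uint64_t parent_gen;	/* parent->generation sampled at clone time */
};

static bool ctx_equiv(const struct ctx *a, const struct ctx *b)
{
	if (a == b->parent && a->generation == b->parent_gen)
		return true;			/* a is b's unmodified parent */
	if (a->parent == b && a->parent_gen == b->generation)
		return true;			/* b is a's unmodified parent */
	if (a->parent && a->parent == b->parent &&
	    a->parent_gen == b->parent_gen)
		return true;			/* same unmodified parent */
	return false;
}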
*/ value = local64_read(&next_event->count); value = local64_xchg(&event->count, value); local64_set(&next_event->count, value); swap(event->total_time_enabled, next_event->total_time_enabled); swap(event->total_time_running, next_event->total_time_running); /* * Since we swizzled the values, update the user visible data too. */ perf_event_update_userpage(event); perf_event_update_userpage(next_event); } static void perf_event_sync_stat(struct perf_event_context *ctx, struct perf_event_context *next_ctx) { struct perf_event *event, *next_event; if (!ctx->nr_stat) return; update_context_time(ctx); event = list_first_entry(&ctx->event_list, struct perf_event, event_entry); next_event = list_first_entry(&next_ctx->event_list, struct perf_event, event_entry); while (&event->event_entry != &ctx->event_list && &next_event->event_entry != &next_ctx->event_list) { __perf_event_sync_stat(event, next_event); event = list_next_entry(event, event_entry); next_event = list_next_entry(next_event, event_entry); } } #define double_list_for_each_entry(pos1, pos2, head1, head2, member) \ for (pos1 = list_first_entry(head1, typeof(*pos1), member), \ pos2 = list_first_entry(head2, typeof(*pos2), member); \ !list_entry_is_head(pos1, head1, member) && \ !list_entry_is_head(pos2, head2, member); \ pos1 = list_next_entry(pos1, member), \ pos2 = list_next_entry(pos2, member)) static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx, struct perf_event_context *next_ctx) { struct perf_event_pmu_context *prev_epc, *next_epc; if (!prev_ctx->nr_task_data) return; double_list_for_each_entry(prev_epc, next_epc, &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list, pmu_ctx_entry) { if (WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu)) continue; /* * PMU specific parts of task perf context can require * additional synchronization. As an example of such * synchronization see implementation details of Intel * LBR call stack data profiling; */ if (prev_epc->pmu->swap_task_ctx) prev_epc->pmu->swap_task_ctx(prev_epc, next_epc); else swap(prev_epc->task_ctx_data, next_epc->task_ctx_data); } } static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in) { struct perf_event_pmu_context *pmu_ctx; struct perf_cpu_pmu_context *cpc; list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task) pmu_ctx->pmu->sched_task(pmu_ctx, sched_in); } } static void perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) { struct perf_event_context *ctx = task->perf_event_ctxp; struct perf_event_context *next_ctx; struct perf_event_context *parent, *next_parent; int do_switch = 1; if (likely(!ctx)) return; rcu_read_lock(); next_ctx = rcu_dereference(next->perf_event_ctxp); if (!next_ctx) goto unlock; parent = rcu_dereference(ctx->parent_ctx); next_parent = rcu_dereference(next_ctx->parent_ctx); /* If neither context have a parent context; they cannot be clones. */ if (!parent && !next_parent) goto unlock; if (next_parent == ctx || next_ctx == parent || next_parent == parent) { /* * Looks like the two contexts are clones, so we might be * able to optimize the context switch. We lock both * contexts and check that they are clones under the * lock (including re-checking that neither has been * uncloned in the meantime). It doesn't matter which * order we take the locks because no other cpu could * be trying to lock both of these tasks. 
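/*
 * Illustrative sketch, not part of this file: the clone/swap optimisation
 * above exists because perf_event_attr::inherit gives every child task a
 * cloned context of its own.  Userspace only sees the aggregate: with inherit
 * set, the parent's fd also accumulates work done by children once they exit.
 * Minimal example, error handling omitted:
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count = 0;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;
	attr.inherit = 1;			/* children get cloned counters */

	fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);

	if (fork() == 0) {
		for (volatile long i = 0; i < (1L << 24); i++)
			;
		_exit(0);			/* child's count folds back into the parent */
	}
	wait(NULL);

	read(fd, &count, sizeof(count));
	printf("task-clock (parent + exited child): %llu ns\n",
	       (unsigned long long)count);
	return 0;
}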
*/ raw_spin_lock(&ctx->lock); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { perf_ctx_disable(ctx, false); /* PMIs are disabled; ctx->nr_no_switch_fast is stable. */ if (local_read(&ctx->nr_no_switch_fast) || local_read(&next_ctx->nr_no_switch_fast)) { /* * Must not swap out ctx when there's pending * events that rely on the ctx->task relation. * * Likewise, when a context contains inherit + * SAMPLE_READ events they should be switched * out using the slow path so that they are * treated as if they were distinct contexts. */ raw_spin_unlock(&next_ctx->lock); rcu_read_unlock(); goto inside_switch; } WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); perf_ctx_sched_task_cb(ctx, false); perf_event_swap_task_ctx_data(ctx, next_ctx); perf_ctx_enable(ctx, false); /* * RCU_INIT_POINTER here is safe because we've not * modified the ctx and the above modification of * ctx->task and ctx->task_ctx_data are immaterial * since those values are always verified under * ctx->lock which we're now holding. */ RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx); RCU_INIT_POINTER(next->perf_event_ctxp, ctx); do_switch = 0; perf_event_sync_stat(ctx, next_ctx); } raw_spin_unlock(&next_ctx->lock); raw_spin_unlock(&ctx->lock); } unlock: rcu_read_unlock(); if (do_switch) { raw_spin_lock(&ctx->lock); perf_ctx_disable(ctx, false); inside_switch: perf_ctx_sched_task_cb(ctx, false); task_ctx_sched_out(ctx, NULL, EVENT_ALL); perf_ctx_enable(ctx, false); raw_spin_unlock(&ctx->lock); } } static DEFINE_PER_CPU(struct list_head, sched_cb_list); static DEFINE_PER_CPU(int, perf_sched_cb_usages); void perf_sched_cb_dec(struct pmu *pmu) { struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); this_cpu_dec(perf_sched_cb_usages); barrier(); if (!--cpc->sched_cb_usage) list_del(&cpc->sched_cb_entry); } void perf_sched_cb_inc(struct pmu *pmu) { struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); if (!cpc->sched_cb_usage++) list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); barrier(); this_cpu_inc(perf_sched_cb_usages); } /* * This function provides the context switch callback to the lower code * layer. It is invoked ONLY when the context switch callback is enabled. * * This callback is relevant even to per-cpu events; for example multi event * PEBS requires this to provide PID/TID information. This requires we flush * all queued PEBS records before we context switch to a new task. 
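/*
 * Illustrative sketch, not part of this file: how a hypothetical PMU driver
 * would request the callback described above.  It bumps the per-CPU count with
 * perf_sched_cb_inc() while at least one of its events needs context-switch
 * work, and provides pmu::sched_task to do that work.  The driver names and
 * needs_ctx_switch_flush() are made up; only perf_sched_cb_inc()/dec() and the
 * sched_task signature come from the code above.
 */
static int example_pmu_add(struct perf_event *event, int flags)
{
	/* ... program the counter ... */
	if (needs_ctx_switch_flush(event))	/* hypothetical helper */
		perf_sched_cb_inc(event->pmu);
	return 0;
}

static void example_pmu_del(struct perf_event *event, int flags)
{
	if (needs_ctx_switch_flush(event))	/* hypothetical helper */
		perf_sched_cb_dec(event->pmu);
	/* ... stop the counter ... */
}

static void example_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx,
				   bool sched_in)
{
	/* flush or reprogram per-task hardware state here */
}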
*/ static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct pmu *pmu; pmu = cpc->epc.pmu; /* software PMUs will not have sched_task */ if (WARN_ON_ONCE(!pmu->sched_task)) return; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(pmu); pmu->sched_task(cpc->task_epc, sched_in); perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } static void perf_pmu_sched_task(struct task_struct *prev, struct task_struct *next, bool sched_in) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_cpu_pmu_context *cpc; /* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */ if (prev == next || cpuctx->task_ctx) return; list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry) __perf_pmu_sched_task(cpc, sched_in); } static void perf_event_switch(struct task_struct *task, struct task_struct *next_prev, bool sched_in); /* * Called from scheduler to remove the events of the current task, * with interrupts disabled. * * We stop each event and update the event value in event->count. * * This does not protect us against NMI, but disable() * sets the disabled bit in the control field of event _before_ * accessing the event control register. If a NMI hits, then it will * not restart the event. */ void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) { if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(task, next, false); if (atomic_read(&nr_switch_events)) perf_event_switch(task, next, false); perf_event_context_sched_out(task, next); /* * if cgroup events exist on this CPU, then we need * to check if we have to switch out PMU state. * cgroup event are system-wide mode only */ perf_cgroup_switch(next); } static bool perf_less_group_idx(const void *l, const void *r, void __always_unused *args) { const struct perf_event *le = *(const struct perf_event **)l; const struct perf_event *re = *(const struct perf_event **)r; return le->group_index < re->group_index; } DEFINE_MIN_HEAP(struct perf_event *, perf_event_min_heap); static const struct min_heap_callbacks perf_min_heap = { .less = perf_less_group_idx, .swp = NULL, }; static void __heap_add(struct perf_event_min_heap *heap, struct perf_event *event) { struct perf_event **itrs = heap->data; if (event) { itrs[heap->nr] = event; heap->nr++; } } static void __link_epc(struct perf_event_pmu_context *pmu_ctx) { struct perf_cpu_pmu_context *cpc; if (!pmu_ctx->ctx->task) return; cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = pmu_ctx; } static noinline int visit_groups_merge(struct perf_event_context *ctx, struct perf_event_groups *groups, int cpu, struct pmu *pmu, int (*func)(struct perf_event *, void *), void *data) { #ifdef CONFIG_CGROUP_PERF struct cgroup_subsys_state *css = NULL; #endif struct perf_cpu_context *cpuctx = NULL; /* Space for per CPU and/or any CPU event iterators. 
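/*
 * Illustrative sketch, not part of this file: visit_groups_merge() below walks
 * several sorted event sources (any-CPU, this-CPU, per-cgroup sub-trees) in
 * global group_index order by keeping one iterator per source in a min-heap.
 * A standalone, self-contained reduction of that k-way merge pattern over
 * plain integer arrays:
 */
#include <stddef.h>
#include <stdio.h>

struct iter { const int *pos, *end; };		/* one sorted source */

static void iter_swap(struct iter *a, struct iter *b)
{
	struct iter t = *a; *a = *b; *b = t;
}

static void sift_down(struct iter *heap, size_t nr, size_t i)
{
	for (;;) {
		size_t l = 2 * i + 1, r = l + 1, min = i;

		if (l < nr && *heap[l].pos < *heap[min].pos) min = l;
		if (r < nr && *heap[r].pos < *heap[min].pos) min = r;
		if (min == i)
			return;
		iter_swap(&heap[i], &heap[min]);
		i = min;
	}
}

int main(void)
{
	static const int a[] = { 1, 4, 9 }, b[] = { 2, 3, 10 }, c[] = { 5 };
	struct iter heap[] = { { a, a + 3 }, { b, b + 3 }, { c, c + 1 } };
	size_t nr = 3;

	for (size_t i = nr / 2; i-- > 0;)	/* heapify, cf. min_heapify_all_inline() */
		sift_down(heap, nr, i);

	while (nr) {
		printf("%d ", *heap[0].pos);	/* visit the smallest head */
		if (++heap[0].pos == heap[0].end)
			heap[0] = heap[--nr];	/* source exhausted: pop it */
		sift_down(heap, nr, 0);		/* cf. min_heap_sift_down_inline() */
	}
	printf("\n");				/* prints: 1 2 3 4 5 9 10 */
	return 0;
}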
*/ struct perf_event *itrs[2]; struct perf_event_min_heap event_heap; struct perf_event **evt; int ret; if (pmu->filter && pmu->filter(pmu, cpu)) return 0; if (!ctx->task) { cpuctx = this_cpu_ptr(&perf_cpu_context); event_heap = (struct perf_event_min_heap){ .data = cpuctx->heap, .nr = 0, .size = cpuctx->heap_size, }; lockdep_assert_held(&cpuctx->ctx.lock); #ifdef CONFIG_CGROUP_PERF if (cpuctx->cgrp) css = &cpuctx->cgrp->css; #endif } else { event_heap = (struct perf_event_min_heap){ .data = itrs, .nr = 0, .size = ARRAY_SIZE(itrs), }; /* Events not within a CPU context may be on any CPU. */ __heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL)); } evt = event_heap.data; __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL)); #ifdef CONFIG_CGROUP_PERF for (; css; css = css->parent) __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup)); #endif if (event_heap.nr) { __link_epc((*evt)->pmu_ctx); perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu); } min_heapify_all_inline(&event_heap, &perf_min_heap, NULL); while (event_heap.nr) { ret = func(*evt, data); if (ret) return ret; *evt = perf_event_groups_next(*evt, pmu); if (*evt) min_heap_sift_down_inline(&event_heap, 0, &perf_min_heap, NULL); else min_heap_pop_inline(&event_heap, &perf_min_heap, NULL); } return 0; } /* * Because the userpage is strictly per-event (there is no concept of context, * so there cannot be a context indirection), every userpage must be updated * when context time starts :-( * * IOW, we must not miss EVENT_TIME edges. */ static inline bool event_update_userpage(struct perf_event *event) { if (likely(!atomic_read(&event->mmap_count))) return false; perf_event_update_time(event); perf_event_update_userpage(event); return true; } static inline void group_update_userpage(struct perf_event *group_event) { struct perf_event *event; if (!event_update_userpage(group_event)) return; for_each_sibling_event(event, group_event) event_update_userpage(event); } static int merge_sched_in(struct perf_event *event, void *data) { struct perf_event_context *ctx = event->ctx; int *can_add_hw = data; if (event->state <= PERF_EVENT_STATE_OFF) return 0; if (!event_filter_match(event)) return 0; if (group_can_go_on(event, *can_add_hw)) { if (!group_sched_in(event, ctx)) list_add_tail(&event->active_list, get_event_list(event)); } if (event->state == PERF_EVENT_STATE_INACTIVE) { *can_add_hw = 0; if (event->attr.pinned) { perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } else { struct perf_cpu_pmu_context *cpc; event->pmu_ctx->rotate_necessary = 1; cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context); perf_mux_hrtimer_restart(cpc); group_update_userpage(event); } } return 0; } static void pmu_groups_sched_in(struct perf_event_context *ctx, struct perf_event_groups *groups, struct pmu *pmu) { int can_add_hw = 1; visit_groups_merge(ctx, groups, smp_processor_id(), pmu, merge_sched_in, &can_add_hw); } static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx, enum event_type_t event_type) { struct perf_event_context *ctx = pmu_ctx->ctx; if (event_type & EVENT_PINNED) pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu); if (event_type & EVENT_FLEXIBLE) pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu); } static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_pmu_context *pmu_ctx; int 
is_active = ctx->is_active; bool cgroup = event_type & EVENT_CGROUP; event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); if (likely(!ctx->nr_events)) return; if (!(is_active & EVENT_TIME)) { /* start ctx time */ __update_context_time(ctx, false); perf_cgroup_set_timestamp(cpuctx); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() */ barrier(); } ctx->is_active |= (event_type | EVENT_TIME); if (ctx->task) { if (!(is_active & EVENT_ALL)) cpuctx->task_ctx = ctx; else WARN_ON_ONCE(cpuctx->task_ctx != ctx); } is_active ^= ctx->is_active; /* changed bits */ /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ if (is_active & EVENT_PINNED) { for_each_epc(pmu_ctx, ctx, pmu, cgroup) __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED); } /* Then walk through the lower prio flexible groups */ if (is_active & EVENT_FLEXIBLE) { for_each_epc(pmu_ctx, ctx, pmu, cgroup) __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE); } } static void perf_event_context_sched_in(struct task_struct *task) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *ctx; rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp); if (!ctx) goto rcu_unlock; if (cpuctx->task_ctx == ctx) { perf_ctx_lock(cpuctx, ctx); perf_ctx_disable(ctx, false); perf_ctx_sched_task_cb(ctx, true); perf_ctx_enable(ctx, false); perf_ctx_unlock(cpuctx, ctx); goto rcu_unlock; } perf_ctx_lock(cpuctx, ctx); /* * We must check ctx->nr_events while holding ctx->lock, such * that we serialize against perf_install_in_context(). */ if (!ctx->nr_events) goto unlock; perf_ctx_disable(ctx, false); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, * cpu flexible, task flexible. * * However, if task's ctx is not carrying any pinned * events, no need to flip the cpuctx's events around. */ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { perf_ctx_disable(&cpuctx->ctx, false); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE); } perf_event_sched_in(cpuctx, ctx, NULL); perf_ctx_sched_task_cb(cpuctx->task_ctx, true); if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) perf_ctx_enable(&cpuctx->ctx, false); perf_ctx_enable(ctx, false); unlock: perf_ctx_unlock(cpuctx, ctx); rcu_unlock: rcu_read_unlock(); } /* * Called from scheduler to add the events of the current task * with interrupts disabled. * * We restore the event value and then enable it. * * This does not protect us against NMI, but enable() * sets the enabled bit in the control field of event _before_ * accessing the event control register. If a NMI hits, then it will * keep the event running. 
*/ void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { perf_event_context_sched_in(task); if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(prev, task, true); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) { u64 frequency = event->attr.sample_freq; u64 sec = NSEC_PER_SEC; u64 divisor, dividend; int count_fls, nsec_fls, frequency_fls, sec_fls; count_fls = fls64(count); nsec_fls = fls64(nsec); frequency_fls = fls64(frequency); sec_fls = 30; /* * We got @count in @nsec, with a target of sample_freq HZ * the target period becomes: * * @count * 10^9 * period = ------------------- * @nsec * sample_freq * */ /* * Reduce accuracy by one bit such that @a and @b converge * to a similar magnitude. */ #define REDUCE_FLS(a, b) \ do { \ if (a##_fls > b##_fls) { \ a >>= 1; \ a##_fls--; \ } else { \ b >>= 1; \ b##_fls--; \ } \ } while (0) /* * Reduce accuracy until either term fits in a u64, then proceed with * the other, so that finally we can do a u64/u64 division. */ while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) { REDUCE_FLS(nsec, frequency); REDUCE_FLS(sec, count); } if (count_fls + sec_fls > 64) { divisor = nsec * frequency; while (count_fls + sec_fls > 64) { REDUCE_FLS(count, sec); divisor >>= 1; } dividend = count * sec; } else { dividend = count * sec; while (nsec_fls + frequency_fls > 64) { REDUCE_FLS(nsec, frequency); dividend >>= 1; } divisor = nsec * frequency; } if (!divisor) return dividend; return div64_u64(dividend, divisor); } static DEFINE_PER_CPU(int, perf_throttled_count); static DEFINE_PER_CPU(u64, perf_throttled_seq); static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable) { struct hw_perf_event *hwc = &event->hw; s64 period, sample_period; s64 delta; period = perf_calculate_period(event, nsec, count); delta = (s64)(period - hwc->sample_period); if (delta >= 0) delta += 7; else delta -= 7; delta /= 8; /* low pass filter */ sample_period = hwc->sample_period + delta; if (!sample_period) sample_period = 1; hwc->sample_period = sample_period; if (local64_read(&hwc->period_left) > 8*sample_period) { if (disable) event->pmu->stop(event, PERF_EF_UPDATE); local64_set(&hwc->period_left, 0); if (disable) event->pmu->start(event, PERF_EF_RELOAD); } } static void perf_adjust_freq_unthr_events(struct list_head *event_list) { struct perf_event *event; struct hw_perf_event *hwc; u64 now, period = TICK_NSEC; s64 delta; list_for_each_entry(event, event_list, active_list) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; // XXX use visit thingy to avoid the -1,cpu match if (!event_filter_match(event)) continue; hwc = &event->hw; if (hwc->interrupts == MAX_INTERRUPTS) { hwc->interrupts = 0; perf_log_throttle(event, 1); if (!event->attr.freq || !event->attr.sample_freq) event->pmu->start(event, 0); } if (!event->attr.freq || !event->attr.sample_freq) continue; /* * stop the event and update event->count */ event->pmu->stop(event, PERF_EF_UPDATE); now = local64_read(&event->count); delta = now - hwc->freq_count_stamp; hwc->freq_count_stamp = now; /* * restart the event * reload only if value has changed * we have stopped the event so tell that * to perf_adjust_period() to avoid stopping it * twice. */ if (delta > 0) perf_adjust_period(event, period, delta, false); event->pmu->start(event, delta > 0 ? 
PERF_EF_RELOAD : 0); } } /* * combine freq adjustment with unthrottling to avoid two passes over the * events. At the same time, make sure, having freq events does not change * the rate of unthrottling as that would introduce bias. */ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle) { struct perf_event_pmu_context *pmu_ctx; /* * only need to iterate over all events iff: * - context have events in frequency mode (needs freq adjust) * - there are events to unthrottle on this cpu */ if (!(ctx->nr_freq || unthrottle)) return; raw_spin_lock(&ctx->lock); list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { if (!(pmu_ctx->nr_freq || unthrottle)) continue; if (!perf_pmu_ctx_is_active(pmu_ctx)) continue; if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) continue; perf_pmu_disable(pmu_ctx->pmu); perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active); perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active); perf_pmu_enable(pmu_ctx->pmu); } raw_spin_unlock(&ctx->lock); } /* * Move @event to the tail of the @ctx's elegible events. */ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event) { /* * Rotate the first entry last of non-pinned groups. Rotation might be * disabled by the inheritance code. */ if (ctx->rotate_disable) return; perf_event_groups_delete(&ctx->flexible_groups, event); perf_event_groups_insert(&ctx->flexible_groups, event); } /* pick an event from the flexible_groups to rotate */ static inline struct perf_event * ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx) { struct perf_event *event; struct rb_node *node; struct rb_root *tree; struct __group_key key = { .pmu = pmu_ctx->pmu, }; /* pick the first active flexible event */ event = list_first_entry_or_null(&pmu_ctx->flexible_active, struct perf_event, active_list); if (event) goto out; /* if no active flexible event, pick the first event */ tree = &pmu_ctx->ctx->flexible_groups.tree; if (!pmu_ctx->ctx->task) { key.cpu = smp_processor_id(); node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); if (node) event = __node_2_pe(node); goto out; } key.cpu = -1; node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); if (node) { event = __node_2_pe(node); goto out; } key.cpu = smp_processor_id(); node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); if (node) event = __node_2_pe(node); out: /* * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in() * finds there are unschedulable events, it will set it again. */ pmu_ctx->rotate_necessary = 0; return event; } static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_pmu_context *cpu_epc, *task_epc = NULL; struct perf_event *cpu_event = NULL, *task_event = NULL; int cpu_rotate, task_rotate; struct pmu *pmu; /* * Since we run this from IRQ context, nobody can install new * events, thus the event count values are stable. */ cpu_epc = &cpc->epc; pmu = cpu_epc->pmu; task_epc = cpc->task_epc; cpu_rotate = cpu_epc->rotate_necessary; task_rotate = task_epc ? task_epc->rotate_necessary : 0; if (!(cpu_rotate || task_rotate)) return false; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(pmu); if (task_rotate) task_event = ctx_event_to_rotate(task_epc); if (cpu_rotate) cpu_event = ctx_event_to_rotate(cpu_epc); /* * As per the order given at ctx_resched() first 'pop' task flexible * and then, if needed CPU flexible. 
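/*
 * Illustrative sketch, not part of this file: the target of
 * perf_calculate_period() above is simply
 *
 *	period = count * NSEC_PER_SEC / (nsec * sample_freq)
 *
 * and the REDUCE_FLS() dance only exists to keep both products within 64 bits.
 * The same formula with 128-bit arithmetic (assuming a compiler that provides
 * unsigned __int128), plus the delta/8 low-pass step of perf_adjust_period():
 */
#include <stdint.h>

#define NSEC_PER_SEC	1000000000ULL

static uint64_t calc_period(uint64_t count, uint64_t nsec, uint64_t freq)
{
	unsigned __int128 dividend = (unsigned __int128)count * NSEC_PER_SEC;
	unsigned __int128 divisor  = (unsigned __int128)nsec * freq;

	return divisor ? (uint64_t)(dividend / divisor) : (uint64_t)dividend;
}

/* Move sample_period an eighth of the way towards the new target. */
static int64_t lowpass_period(int64_t sample_period, int64_t target)
{
	int64_t delta = target - sample_period;

	delta = (delta >= 0 ? delta + 7 : delta - 7) / 8;
	sample_period += delta;
	return sample_period ? sample_period : 1;
}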
*/ if (task_event || (task_epc && cpu_event)) { update_context_time(task_epc->ctx); __pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE); } if (cpu_event) { update_context_time(&cpuctx->ctx); __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE); rotate_ctx(&cpuctx->ctx, cpu_event); __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE); } if (task_event) rotate_ctx(task_epc->ctx, task_event); if (task_event || (task_epc && cpu_event)) __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE); perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); return true; } void perf_event_task_tick(void) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *ctx; int throttled; lockdep_assert_irqs_disabled(); __this_cpu_inc(perf_throttled_seq); throttled = __this_cpu_xchg(perf_throttled_count, 0); tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled); rcu_read_lock(); ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_adjust_freq_unthr_context(ctx, !!throttled); rcu_read_unlock(); } static int event_enable_on_exec(struct perf_event *event, struct perf_event_context *ctx) { if (!event->attr.enable_on_exec) return 0; event->attr.enable_on_exec = 0; if (event->state >= PERF_EVENT_STATE_INACTIVE) return 0; perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); return 1; } /* * Enable all of a task's events that have been marked enable-on-exec. * This expects task == current. */ static void perf_event_enable_on_exec(struct perf_event_context *ctx) { struct perf_event_context *clone_ctx = NULL; enum event_type_t event_type = 0; struct perf_cpu_context *cpuctx; struct perf_event *event; unsigned long flags; int enabled = 0; local_irq_save(flags); if (WARN_ON_ONCE(current->perf_event_ctxp != ctx)) goto out; if (!ctx->nr_events) goto out; cpuctx = this_cpu_ptr(&perf_cpu_context); perf_ctx_lock(cpuctx, ctx); ctx_time_freeze(cpuctx, ctx); list_for_each_entry(event, &ctx->event_list, event_entry) { enabled |= event_enable_on_exec(event, ctx); event_type |= get_event_type(event); } /* * Unclone and reschedule this context if we enabled any event. */ if (enabled) { clone_ctx = unclone_ctx(ctx); ctx_resched(cpuctx, ctx, NULL, event_type); } perf_ctx_unlock(cpuctx, ctx); out: local_irq_restore(flags); if (clone_ctx) put_ctx(clone_ctx); } static void perf_remove_from_owner(struct perf_event *event); static void perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx); /* * Removes all events from the current task that have been marked * remove-on-exec, and feeds their values back to parent events. 
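/*
 * Illustrative sketch, not part of this file: the enable_on_exec handling
 * above is what lets "perf stat -- cmd" start counting exactly at exec time.
 * Hypothetical userspace equivalent; the usleep() is a crude stand-in for the
 * pipe-based handshake real tools use, and error handling is omitted:
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count = 0;
	pid_t child;
	int fd;

	child = fork();
	if (child == 0) {
		usleep(10000);			/* let the parent attach first */
		execl("/bin/true", "true", (char *)NULL);
		_exit(127);
	}

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	attr.disabled = 1;			/* stays off ... */
	attr.enable_on_exec = 1;		/* ... until the child calls exec */

	fd = syscall(SYS_perf_event_open, &attr, child, -1, -1, 0);
	waitpid(child, NULL, 0);
	read(fd, &count, sizeof(count));
	printf("instructions in /bin/true: %llu\n", (unsigned long long)count);
	return 0;
}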
*/ static void perf_event_remove_on_exec(struct perf_event_context *ctx) { struct perf_event_context *clone_ctx = NULL; struct perf_event *event, *next; unsigned long flags; bool modified = false; mutex_lock(&ctx->mutex); if (WARN_ON_ONCE(ctx->task != current)) goto unlock; list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) { if (!event->attr.remove_on_exec) continue; if (!is_kernel_event(event)) perf_remove_from_owner(event); modified = true; perf_event_exit_event(event, ctx); } raw_spin_lock_irqsave(&ctx->lock, flags); if (modified) clone_ctx = unclone_ctx(ctx); raw_spin_unlock_irqrestore(&ctx->lock, flags); unlock: mutex_unlock(&ctx->mutex); if (clone_ctx) put_ctx(clone_ctx); } struct perf_read_data { struct perf_event *event; bool group; int ret; }; static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu); static int __perf_event_read_cpu(struct perf_event *event, int event_cpu) { int local_cpu = smp_processor_id(); u16 local_pkg, event_pkg; if ((unsigned)event_cpu >= nr_cpu_ids) return event_cpu; if (event->group_caps & PERF_EV_CAP_READ_SCOPE) { const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(event->pmu->scope, event_cpu); if (cpumask && cpumask_test_cpu(local_cpu, cpumask)) return local_cpu; } if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { event_pkg = topology_physical_package_id(event_cpu); local_pkg = topology_physical_package_id(local_cpu); if (event_pkg == local_pkg) return local_cpu; } return event_cpu; } /* * Cross CPU call to read the hardware event */ static void __perf_event_read(void *info) { struct perf_read_data *data = info; struct perf_event *sub, *event = data->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct pmu *pmu = event->pmu; /* * If this is a task context, we need to check whether it is * the current task context of this cpu. If not it has been * scheduled out before the smp call arrived. In that case * event->count would have been updated to a recent sample * when the event was scheduled out. */ if (ctx->task && cpuctx->task_ctx != ctx) return; raw_spin_lock(&ctx->lock); ctx_time_update_event(ctx, event); perf_event_update_time(event); if (data->group) perf_event_update_sibling_time(event); if (event->state != PERF_EVENT_STATE_ACTIVE) goto unlock; if (!data->group) { pmu->read(event); data->ret = 0; goto unlock; } pmu->start_txn(pmu, PERF_PMU_TXN_READ); pmu->read(event); for_each_sibling_event(sub, event) { if (sub->state == PERF_EVENT_STATE_ACTIVE) { /* * Use sibling's PMU rather than @event's since * sibling could be on different (eg: software) PMU. 
*/ sub->pmu->read(sub); } } data->ret = pmu->commit_txn(pmu); unlock: raw_spin_unlock(&ctx->lock); } static inline u64 perf_event_count(struct perf_event *event, bool self) { if (self) return local64_read(&event->count); return local64_read(&event->count) + atomic64_read(&event->child_count); } static void calc_timer_values(struct perf_event *event, u64 *now, u64 *enabled, u64 *running) { u64 ctx_time; *now = perf_clock(); ctx_time = perf_event_time_now(event, *now); __perf_update_times(event, ctx_time, enabled, running); } /* * NMI-safe method to read a local event, that is an event that * is: * - either for the current task, or for this CPU * - does not have inherit set, for inherited task events * will not be local and we cannot read them atomically * - must not have a pmu::count method */ int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running) { unsigned long flags; int event_oncpu; int event_cpu; int ret = 0; /* * Disabling interrupts avoids all counter scheduling (context * switches, timer based rotation and IPIs). */ local_irq_save(flags); /* * It must not be an event with inherit set, we cannot read * all child counters from atomic context. */ if (event->attr.inherit) { ret = -EOPNOTSUPP; goto out; } /* If this is a per-task event, it must be for current */ if ((event->attach_state & PERF_ATTACH_TASK) && event->hw.target != current) { ret = -EINVAL; goto out; } /* * Get the event CPU numbers, and adjust them to local if the event is * a per-package event that can be read locally */ event_oncpu = __perf_event_read_cpu(event, event->oncpu); event_cpu = __perf_event_read_cpu(event, event->cpu); /* If this is a per-CPU event, it must be for this CPU */ if (!(event->attach_state & PERF_ATTACH_TASK) && event_cpu != smp_processor_id()) { ret = -EINVAL; goto out; } /* If this is a pinned event it must be running on this CPU */ if (event->attr.pinned && event_oncpu != smp_processor_id()) { ret = -EBUSY; goto out; } /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise * oncpu == -1). */ if (event_oncpu == smp_processor_id()) event->pmu->read(event); *value = local64_read(&event->count); if (enabled || running) { u64 __enabled, __running, __now; calc_timer_values(event, &__now, &__enabled, &__running); if (enabled) *enabled = __enabled; if (running) *running = __running; } out: local_irq_restore(flags); return ret; } static int perf_event_read(struct perf_event *event, bool group) { enum perf_event_state state = READ_ONCE(event->state); int event_cpu, ret = 0; /* * If event is enabled and currently active on a CPU, update the * value in the event structure: */ again: if (state == PERF_EVENT_STATE_ACTIVE) { struct perf_read_data data; /* * Orders the ->state and ->oncpu loads such that if we see * ACTIVE we must also see the right ->oncpu. * * Matches the smp_wmb() from event_sched_in(). */ smp_rmb(); event_cpu = READ_ONCE(event->oncpu); if ((unsigned)event_cpu >= nr_cpu_ids) return 0; data = (struct perf_read_data){ .event = event, .group = group, .ret = 0, }; preempt_disable(); event_cpu = __perf_event_read_cpu(event, event_cpu); /* * Purposely ignore the smp_call_function_single() return * value. * * If event_cpu isn't a valid CPU it means the event got * scheduled out and that will have updated the event count. * * Therefore, either way, we'll have an up-to-date event count * after this. 
*/ (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1); preempt_enable(); ret = data.ret; } else if (state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; unsigned long flags; raw_spin_lock_irqsave(&ctx->lock, flags); state = event->state; if (state != PERF_EVENT_STATE_INACTIVE) { raw_spin_unlock_irqrestore(&ctx->lock, flags); goto again; } /* * May read while context is not active (e.g., thread is * blocked), in that case we cannot update context time */ ctx_time_update_event(ctx, event); perf_event_update_time(event); if (group) perf_event_update_sibling_time(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } return ret; } /* * Initialize the perf_event context in a task_struct: */ static void __perf_event_init_context(struct perf_event_context *ctx) { raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->pmu_ctx_list); perf_event_groups_init(&ctx->pinned_groups); perf_event_groups_init(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); refcount_set(&ctx->refcount, 1); } static void __perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu) { epc->pmu = pmu; INIT_LIST_HEAD(&epc->pmu_ctx_entry); INIT_LIST_HEAD(&epc->pinned_active); INIT_LIST_HEAD(&epc->flexible_active); atomic_set(&epc->refcount, 1); } static struct perf_event_context * alloc_perf_context(struct task_struct *task) { struct perf_event_context *ctx; ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); if (!ctx) return NULL; __perf_event_init_context(ctx); if (task) ctx->task = get_task_struct(task); return ctx; } static struct task_struct * find_lively_task_by_vpid(pid_t vpid) { struct task_struct *task; rcu_read_lock(); if (!vpid) task = current; else task = find_task_by_vpid(vpid); if (task) get_task_struct(task); rcu_read_unlock(); if (!task) return ERR_PTR(-ESRCH); return task; } /* * Returns a matching context with refcount and pincount. */ static struct perf_event_context * find_get_context(struct task_struct *task, struct perf_event *event) { struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_cpu_context *cpuctx; unsigned long flags; int err; if (!task) { /* Must be root to operate on a CPU event: */ err = perf_allow_cpu(&event->attr); if (err) return ERR_PTR(err); cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu); ctx = &cpuctx->ctx; get_ctx(ctx); raw_spin_lock_irqsave(&ctx->lock, flags); ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); return ctx; } err = -EINVAL; retry: ctx = perf_lock_task_context(task, &flags); if (ctx) { clone_ctx = unclone_ctx(ctx); ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); if (clone_ctx) put_ctx(clone_ctx); } else { ctx = alloc_perf_context(task); err = -ENOMEM; if (!ctx) goto errout; err = 0; mutex_lock(&task->perf_event_mutex); /* * If it has already passed perf_event_exit_task(). * we must see PF_EXITING, it takes this mutex too. 
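/*
 * Illustrative sketch, not part of this file: find_get_context() above selects
 * the per-task or the per-CPU context purely from the perf_event_open()
 * pid/cpu arguments.  Fragment; attr and target_pid are assumed to be set up
 * elsewhere, and the CPU-wide form needs CAP_PERFMON/root, matching the
 * perf_allow_cpu() check above.
 */
	/* per-task, any CPU: lives in the task's perf_event_ctxp context */
	int fd_task = syscall(SYS_perf_event_open, &attr, target_pid, -1, -1, 0);

	/* CPU-wide, every task on CPU 3: lives in that CPU's perf_cpu_context */
	int fd_cpu = syscall(SYS_perf_event_open, &attr, -1, 3, -1, 0);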
*/ if (task->flags & PF_EXITING) err = -ESRCH; else if (task->perf_event_ctxp) err = -EAGAIN; else { get_ctx(ctx); ++ctx->pin_count; rcu_assign_pointer(task->perf_event_ctxp, ctx); } mutex_unlock(&task->perf_event_mutex); if (unlikely(err)) { put_ctx(ctx); if (err == -EAGAIN) goto retry; goto errout; } } return ctx; errout: return ERR_PTR(err); } static struct perf_event_pmu_context * find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, struct perf_event *event) { struct perf_event_pmu_context *new = NULL, *epc; void *task_ctx_data = NULL; if (!ctx->task) { /* * perf_pmu_migrate_context() / __perf_pmu_install_event() * relies on the fact that find_get_pmu_context() cannot fail * for CPU contexts. */ struct perf_cpu_pmu_context *cpc; cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); epc = &cpc->epc; raw_spin_lock_irq(&ctx->lock); if (!epc->ctx) { atomic_set(&epc->refcount, 1); epc->embedded = 1; list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); epc->ctx = ctx; } else { WARN_ON_ONCE(epc->ctx != ctx); atomic_inc(&epc->refcount); } raw_spin_unlock_irq(&ctx->lock); return epc; } new = kzalloc(sizeof(*epc), GFP_KERNEL); if (!new) return ERR_PTR(-ENOMEM); if (event->attach_state & PERF_ATTACH_TASK_DATA) { task_ctx_data = alloc_task_ctx_data(pmu); if (!task_ctx_data) { kfree(new); return ERR_PTR(-ENOMEM); } } __perf_init_event_pmu_context(new, pmu); /* * XXX * * lockdep_assert_held(&ctx->mutex); * * can't because perf_event_init_task() doesn't actually hold the * child_ctx->mutex. */ raw_spin_lock_irq(&ctx->lock); list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) { if (epc->pmu == pmu) { WARN_ON_ONCE(epc->ctx != ctx); atomic_inc(&epc->refcount); goto found_epc; } } epc = new; new = NULL; list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); epc->ctx = ctx; found_epc: if (task_ctx_data && !epc->task_ctx_data) { epc->task_ctx_data = task_ctx_data; task_ctx_data = NULL; ctx->nr_task_data++; } raw_spin_unlock_irq(&ctx->lock); free_task_ctx_data(pmu, task_ctx_data); kfree(new); return epc; } static void get_pmu_ctx(struct perf_event_pmu_context *epc) { WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount)); } static void free_epc_rcu(struct rcu_head *head) { struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head); kfree(epc->task_ctx_data); kfree(epc); } static void put_pmu_ctx(struct perf_event_pmu_context *epc) { struct perf_event_context *ctx = epc->ctx; unsigned long flags; /* * XXX * * lockdep_assert_held(&ctx->mutex); * * can't because of the call-site in _free_event()/put_event() * which isn't always called under ctx->mutex. 
*/ if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags)) return; WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry)); list_del_init(&epc->pmu_ctx_entry); epc->ctx = NULL; WARN_ON_ONCE(!list_empty(&epc->pinned_active)); WARN_ON_ONCE(!list_empty(&epc->flexible_active)); raw_spin_unlock_irqrestore(&ctx->lock, flags); if (epc->embedded) return; call_rcu(&epc->rcu_head, free_epc_rcu); } static void perf_event_free_filter(struct perf_event *event); static void free_event_rcu(struct rcu_head *head) { struct perf_event *event = container_of(head, typeof(*event), rcu_head); if (event->ns) put_pid_ns(event->ns); perf_event_free_filter(event); kmem_cache_free(perf_event_cache, event); } static void ring_buffer_attach(struct perf_event *event, struct perf_buffer *rb); static void detach_sb_event(struct perf_event *event) { struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); raw_spin_lock(&pel->lock); list_del_rcu(&event->sb_list); raw_spin_unlock(&pel->lock); } static bool is_sb_event(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; if (event->parent) return false; if (event->attach_state & PERF_ATTACH_TASK) return false; if (attr->mmap || attr->mmap_data || attr->mmap2 || attr->comm || attr->comm_exec || attr->task || attr->ksymbol || attr->context_switch || attr->text_poke || attr->bpf_event) return true; return false; } static void unaccount_pmu_sb_event(struct perf_event *event) { if (is_sb_event(event)) detach_sb_event(event); } #ifdef CONFIG_NO_HZ_FULL static DEFINE_SPINLOCK(nr_freq_lock); #endif static void unaccount_freq_event_nohz(void) { #ifdef CONFIG_NO_HZ_FULL spin_lock(&nr_freq_lock); if (atomic_dec_and_test(&nr_freq_events)) tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS); spin_unlock(&nr_freq_lock); #endif } static void unaccount_freq_event(void) { if (tick_nohz_full_enabled()) unaccount_freq_event_nohz(); else atomic_dec(&nr_freq_events); } static void unaccount_event(struct perf_event *event) { bool dec = false; if (event->parent) return; if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) dec = true; if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.build_id) atomic_dec(&nr_build_id_events); if (event->attr.comm) atomic_dec(&nr_comm_events); if (event->attr.namespaces) atomic_dec(&nr_namespaces_events); if (event->attr.cgroup) atomic_dec(&nr_cgroup_events); if (event->attr.task) atomic_dec(&nr_task_events); if (event->attr.freq) unaccount_freq_event(); if (event->attr.context_switch) { dec = true; atomic_dec(&nr_switch_events); } if (is_cgroup_event(event)) dec = true; if (has_branch_stack(event)) dec = true; if (event->attr.ksymbol) atomic_dec(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_dec(&nr_bpf_events); if (event->attr.text_poke) atomic_dec(&nr_text_poke_events); if (dec) { if (!atomic_add_unless(&perf_sched_count, -1, 1)) schedule_delayed_work(&perf_sched_work, HZ); } unaccount_pmu_sb_event(event); } static void perf_sched_delayed(struct work_struct *work) { mutex_lock(&perf_sched_mutex); if (atomic_dec_and_test(&perf_sched_count)) static_branch_disable(&perf_sched_events); mutex_unlock(&perf_sched_mutex); } /* * The following implement mutual exclusion of events on "exclusive" pmus * (PERF_PMU_CAP_EXCLUSIVE). 
Such pmus can only have one event scheduled * at a time, so we disallow creating events that might conflict, namely: * * 1) cpu-wide events in the presence of per-task events, * 2) per-task events in the presence of cpu-wide events, * 3) two matching events on the same perf_event_context. * * The former two cases are handled in the allocation path (perf_event_alloc(), * _free_event()), the latter -- before the first perf_install_in_context(). */ static int exclusive_event_init(struct perf_event *event) { struct pmu *pmu = event->pmu; if (!is_exclusive_pmu(pmu)) return 0; /* * Prevent co-existence of per-task and cpu-wide events on the * same exclusive pmu. * * Negative pmu::exclusive_cnt means there are cpu-wide * events on this "exclusive" pmu, positive means there are * per-task events. * * Since this is called in perf_event_alloc() path, event::ctx * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK * to mean "per-task event", because unlike other attach states it * never gets cleared. */ if (event->attach_state & PERF_ATTACH_TASK) { if (!atomic_inc_unless_negative(&pmu->exclusive_cnt)) return -EBUSY; } else { if (!atomic_dec_unless_positive(&pmu->exclusive_cnt)) return -EBUSY; } return 0; } static void exclusive_event_destroy(struct perf_event *event) { struct pmu *pmu = event->pmu; if (!is_exclusive_pmu(pmu)) return; /* see comment in exclusive_event_init() */ if (event->attach_state & PERF_ATTACH_TASK) atomic_dec(&pmu->exclusive_cnt); else atomic_inc(&pmu->exclusive_cnt); } static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) { if ((e1->pmu == e2->pmu) && (e1->cpu == e2->cpu || e1->cpu == -1 || e2->cpu == -1)) return true; return false; } static bool exclusive_event_installable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event *iter_event; struct pmu *pmu = event->pmu; lockdep_assert_held(&ctx->mutex); if (!is_exclusive_pmu(pmu)) return true; list_for_each_entry(iter_event, &ctx->event_list, event_entry) { if (exclusive_event_match(iter_event, event)) return false; } return true; } static void perf_addr_filters_splice(struct perf_event *event, struct list_head *head); static void perf_pending_task_sync(struct perf_event *event) { struct callback_head *head = &event->pending_task; if (!event->pending_work) return; /* * If the task is queued to the current task's queue, we * obviously can't wait for it to complete. Simply cancel it. */ if (task_work_cancel(current, head)) { event->pending_work = 0; local_dec(&event->ctx->nr_no_switch_fast); return; } /* * All accesses related to the event are within the same RCU section in * perf_pending_task(). The RCU grace period before the event is freed * will make sure all those accesses are complete by then. */ rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE); } static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending_irq); irq_work_sync(&event->pending_disable_irq); perf_pending_task_sync(event); unaccount_event(event); security_perf_event_free(event); if (event->rb) { /* * Can happen when we close an event with re-directed output. * * Since we have a 0 refcount, perf_mmap_close() will skip * over us; possibly making our ring_buffer_put() the last. 
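/*
 * Illustrative sketch, not part of this file: a standalone reduction of the
 * signed-counter trick used by exclusive_event_init() above.  One atomic
 * counter excludes two classes of users: positive values mean per-task events
 * are present, negative values mean CPU-wide events are.  The helpers mirror
 * atomic_inc_unless_negative()/atomic_dec_unless_positive() but are rebuilt
 * with C11 atomics so the sketch is self-contained.
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int excl_cnt;	/* >0: per-task users, <0: CPU-wide users */

static bool inc_unless_negative(atomic_int *v)
{
	int old = atomic_load(v);

	do {
		if (old < 0)
			return false;
	} while (!atomic_compare_exchange_weak(v, &old, old + 1));
	return true;
}

static bool dec_unless_positive(atomic_int *v)
{
	int old = atomic_load(v);

	do {
		if (old > 0)
			return false;
	} while (!atomic_compare_exchange_weak(v, &old, old - 1));
	return true;
}

/* 0 on success, -1 (-EBUSY in the kernel) if the other class holds the PMU */
static int exclusive_init(bool per_task)
{
	if (per_task)
		return inc_unless_negative(&excl_cnt) ? 0 : -1;
	return dec_unless_positive(&excl_cnt) ? 0 : -1;
}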
*/ mutex_lock(&event->mmap_mutex); ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); } if (is_cgroup_event(event)) perf_detach_cgroup(event); if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) put_callchain_buffers(); } perf_event_free_bpf_prog(event); perf_addr_filters_splice(event, NULL); kfree(event->addr_filter_ranges); if (event->destroy) event->destroy(event); /* * Must be after ->destroy(), due to uprobe_perf_close() using * hw.target. */ if (event->hw.target) put_task_struct(event->hw.target); if (event->pmu_ctx) put_pmu_ctx(event->pmu_ctx); /* * perf_event_free_task() relies on put_ctx() being 'last', in particular * all task references must be cleaned up. */ if (event->ctx) put_ctx(event->ctx); exclusive_event_destroy(event); module_put(event->pmu->module); call_rcu(&event->rcu_head, free_event_rcu); } /* * Used to free events which have a known refcount of 1, such as in error paths * where the event isn't exposed yet and inherited events. */ static void free_event(struct perf_event *event) { if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, "unexpected event refcount: %ld; ptr=%p\n", atomic_long_read(&event->refcount), event)) { /* leak to avoid use-after-free */ return; } _free_event(event); } /* * Remove user event from the owner task. */ static void perf_remove_from_owner(struct perf_event *event) { struct task_struct *owner; rcu_read_lock(); /* * Matches the smp_store_release() in perf_event_exit_task(). If we * observe !owner it means the list deletion is complete and we can * indeed free this event, otherwise we need to serialize on * owner->perf_event_mutex. */ owner = READ_ONCE(event->owner); if (owner) { /* * Since delayed_put_task_struct() also drops the last * task reference we can safely take a new reference * while holding the rcu_read_lock(). */ get_task_struct(owner); } rcu_read_unlock(); if (owner) { /* * If we're here through perf_event_exit_task() we're already * holding ctx->mutex which would be an inversion wrt. the * normal lock order. * * However we can safely take this lock because its the child * ctx->mutex. */ mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING); /* * We have to re-check the event->owner field, if it is cleared * we raced with perf_event_exit_task(), acquiring the mutex * ensured they're done, and we can proceed with freeing the * event. */ if (event->owner) { list_del_init(&event->owner_entry); smp_store_release(&event->owner, NULL); } mutex_unlock(&owner->perf_event_mutex); put_task_struct(owner); } } static void put_event(struct perf_event *event) { if (!atomic_long_dec_and_test(&event->refcount)) return; _free_event(event); } /* * Kill an event dead; while event:refcount will preserve the event * object, it will not preserve its functionality. Once the last 'user' * gives up the object, we'll destroy the thing. */ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct perf_event *child, *tmp; LIST_HEAD(free_list); /* * If we got here through err_alloc: free_event(event); we will not * have attached to a context yet. */ if (!ctx) { WARN_ON_ONCE(event->attach_state & (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP)); goto no_ctx; } if (!is_kernel_event(event)) perf_remove_from_owner(event); ctx = perf_event_ctx_lock(event); WARN_ON_ONCE(ctx->parent_ctx); /* * Mark this event as STATE_DEAD, there is no external reference to it * anymore. 
* * Anybody acquiring event->child_mutex after the below loop _must_ * also see this, most importantly inherit_event() which will avoid * placing more children on the list. * * Thus this guarantees that we will in fact observe and kill _ALL_ * child events. */ perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD); perf_event_ctx_unlock(event, ctx); again: mutex_lock(&event->child_mutex); list_for_each_entry(child, &event->child_list, child_list) { void *var = NULL; /* * Cannot change, child events are not migrated, see the * comment with perf_event_ctx_lock_nested(). */ ctx = READ_ONCE(child->ctx); /* * Since child_mutex nests inside ctx::mutex, we must jump * through hoops. We start by grabbing a reference on the ctx. * * Since the event cannot get freed while we hold the * child_mutex, the context must also exist and have a !0 * reference count. */ get_ctx(ctx); /* * Now that we have a ctx ref, we can drop child_mutex, and * acquire ctx::mutex without fear of it going away. Then we * can re-acquire child_mutex. */ mutex_unlock(&event->child_mutex); mutex_lock(&ctx->mutex); mutex_lock(&event->child_mutex); /* * Now that we hold ctx::mutex and child_mutex, revalidate our * state, if child is still the first entry, it didn't get freed * and we can continue doing so. */ tmp = list_first_entry_or_null(&event->child_list, struct perf_event, child_list); if (tmp == child) { perf_remove_from_context(child, DETACH_GROUP); list_move(&child->child_list, &free_list); /* * This matches the refcount bump in inherit_event(); * this can't be the last reference. */ put_event(event); } else { var = &ctx->refcount; } mutex_unlock(&event->child_mutex); mutex_unlock(&ctx->mutex); put_ctx(ctx); if (var) { /* * If perf_event_free_task() has deleted all events from the * ctx while the child_mutex got released above, make sure to * notify about the preceding put_ctx(). */ smp_mb(); /* pairs with wait_var_event() */ wake_up_var(var); } goto again; } mutex_unlock(&event->child_mutex); list_for_each_entry_safe(child, tmp, &free_list, child_list) { void *var = &child->ctx->refcount; list_del(&child->child_list); free_event(child); /* * Wake any perf_event_free_task() waiting for this event to be * freed. */ smp_mb(); /* pairs with wait_var_event() */ wake_up_var(var); } no_ctx: put_event(event); /* Must be the 'last' reference */ return 0; } EXPORT_SYMBOL_GPL(perf_event_release_kernel); /* * Called when the last reference to the file is gone. 
 */
static int perf_release(struct inode *inode, struct file *file)
{
	perf_event_release_kernel(file->private_data);
	return 0;
}

static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
	struct perf_event *child;
	u64 total = 0;

	*enabled = 0;
	*running = 0;

	mutex_lock(&event->child_mutex);

	(void)perf_event_read(event, false);
	total += perf_event_count(event, false);

	*enabled += event->total_time_enabled +
			atomic64_read(&event->child_total_time_enabled);
	*running += event->total_time_running +
			atomic64_read(&event->child_total_time_running);

	list_for_each_entry(child, &event->child_list, child_list) {
		(void)perf_event_read(child, false);
		total += perf_event_count(child, false);
		*enabled += child->total_time_enabled;
		*running += child->total_time_running;
	}
	mutex_unlock(&event->child_mutex);

	return total;
}

u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
	struct perf_event_context *ctx;
	u64 count;

	ctx = perf_event_ctx_lock(event);
	count = __perf_event_read_value(event, enabled, running);
	perf_event_ctx_unlock(event, ctx);

	return count;
}
EXPORT_SYMBOL_GPL(perf_event_read_value);

static int __perf_read_group_add(struct perf_event *leader,
					u64 read_format, u64 *values)
{
	struct perf_event_context *ctx = leader->ctx;
	struct perf_event *sub, *parent;
	unsigned long flags;
	int n = 1; /* skip @nr */
	int ret;

	ret = perf_event_read(leader, true);
	if (ret)
		return ret;

	raw_spin_lock_irqsave(&ctx->lock, flags);
	/*
	 * Verify the grouping between the parent and child (inherited)
	 * events is still intact.
	 *
	 * Specifically:
	 *  - leader->ctx->lock pins leader->sibling_list
	 *  - parent->child_mutex pins parent->child_list
	 *  - parent->ctx->mutex pins parent->sibling_list
	 *
	 * Because parent->ctx != leader->ctx (and child_list nests inside
	 * ctx->mutex), group destruction is not atomic between children, also
	 * see perf_event_release_kernel(). Additionally, parent can grow the
	 * group.
	 *
	 * Therefore it is possible to have parent and child groups in a
	 * different configuration and summing over such a beast makes no sense
	 * whatsoever.
	 *
	 * Reject this.
	 */
	parent = leader->parent;
	if (parent &&
	    (parent->group_generation != leader->group_generation ||
	     parent->nr_siblings != leader->nr_siblings)) {
		ret = -ECHILD;
		goto unlock;
	}

	/*
	 * Since we co-schedule groups, {enabled,running} times of siblings
	 * will be identical to those of the leader, so we only publish one
	 * set.
	 */
	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
		values[n++] += leader->total_time_enabled +
			atomic64_read(&leader->child_total_time_enabled);
	}

	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
		values[n++] += leader->total_time_running +
			atomic64_read(&leader->child_total_time_running);
	}

	/*
	 * Write {count,id} tuples for every sibling.
*/ values[n++] += perf_event_count(leader, false); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&leader->lost_samples); for_each_sibling_event(sub, leader) { values[n++] += perf_event_count(sub, false); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&sub->lost_samples); } unlock: raw_spin_unlock_irqrestore(&ctx->lock, flags); return ret; } static int perf_read_group(struct perf_event *event, u64 read_format, char __user *buf) { struct perf_event *leader = event->group_leader, *child; struct perf_event_context *ctx = leader->ctx; int ret; u64 *values; lockdep_assert_held(&ctx->mutex); values = kzalloc(event->read_size, GFP_KERNEL); if (!values) return -ENOMEM; values[0] = 1 + leader->nr_siblings; mutex_lock(&leader->child_mutex); ret = __perf_read_group_add(leader, read_format, values); if (ret) goto unlock; list_for_each_entry(child, &leader->child_list, child_list) { ret = __perf_read_group_add(child, read_format, values); if (ret) goto unlock; } mutex_unlock(&leader->child_mutex); ret = event->read_size; if (copy_to_user(buf, values, event->read_size)) ret = -EFAULT; goto out; unlock: mutex_unlock(&leader->child_mutex); out: kfree(values); return ret; } static int perf_read_one(struct perf_event *event, u64 read_format, char __user *buf) { u64 enabled, running; u64 values[5]; int n = 0; values[n++] = __perf_event_read_value(event, &enabled, &running); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) values[n++] = enabled; if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&event->lost_samples); if (copy_to_user(buf, values, n * sizeof(u64))) return -EFAULT; return n * sizeof(u64); } static bool is_event_hup(struct perf_event *event) { bool no_children; if (event->state > PERF_EVENT_STATE_EXIT) return false; mutex_lock(&event->child_mutex); no_children = list_empty(&event->child_list); mutex_unlock(&event->child_mutex); return no_children; } /* * Read the performance event - simple non blocking version for now */ static ssize_t __perf_read(struct perf_event *event, char __user *buf, size_t count) { u64 read_format = event->attr.read_format; int ret; /* * Return end-of-file for a read on an event that is in * error state (i.e. because it was pinned but it couldn't be * scheduled on to the CPU at some point). 
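 *
 * Illustrative aside (not part of this file): the buffers that
 * perf_read_one() and perf_read_group() above fill are exactly what
 * userspace gets from read() on a perf_event_open() fd.  A minimal consumer
 * for read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID (helper name and
 * buffer size are arbitrary, error handling elided):
 *
 *	#include <stdint.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	struct read_group {
 *		uint64_t nr;			// number of { value, id } pairs
 *		struct {
 *			uint64_t value;
 *			uint64_t id;		// present because of PERF_FORMAT_ID
 *		} cntr[];
 *	};
 *
 *	static void dump_group(int group_fd)
 *	{
 *		uint64_t buf[512];
 *		struct read_group *rg = (struct read_group *)buf;
 *
 *		if (read(group_fd, buf, sizeof(buf)) < 0)
 *			return;
 *		for (uint64_t i = 0; i < rg->nr; i++)
 *			printf("id %llu: %llu\n",
 *			       (unsigned long long)rg->cntr[i].id,
 *			       (unsigned long long)rg->cntr[i].value);
 *	}
 *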
*/ if (event->state == PERF_EVENT_STATE_ERROR) return 0; if (count < event->read_size) return -ENOSPC; WARN_ON_ONCE(event->ctx->parent_ctx); if (read_format & PERF_FORMAT_GROUP) ret = perf_read_group(event, read_format, buf); else ret = perf_read_one(event, read_format, buf); return ret; } static ssize_t perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct perf_event *event = file->private_data; struct perf_event_context *ctx; int ret; ret = security_perf_event_read(event); if (ret) return ret; ctx = perf_event_ctx_lock(event); ret = __perf_read(event, buf, count); perf_event_ctx_unlock(event, ctx); return ret; } static __poll_t perf_poll(struct file *file, poll_table *wait) { struct perf_event *event = file->private_data; struct perf_buffer *rb; __poll_t events = EPOLLHUP; poll_wait(file, &event->waitq, wait); if (is_event_hup(event)) return events; /* * Pin the event->rb by taking event->mmap_mutex; otherwise * perf_event_set_output() can swizzle our rb and make us miss wakeups. */ mutex_lock(&event->mmap_mutex); rb = event->rb; if (rb) events = atomic_xchg(&rb->poll, 0); mutex_unlock(&event->mmap_mutex); return events; } static void _perf_event_reset(struct perf_event *event) { (void)perf_event_read(event, false); local64_set(&event->count, 0); perf_event_update_userpage(event); } /* Assume it's not an event with inherit set. */ u64 perf_event_pause(struct perf_event *event, bool reset) { struct perf_event_context *ctx; u64 count; ctx = perf_event_ctx_lock(event); WARN_ON_ONCE(event->attr.inherit); _perf_event_disable(event); count = local64_read(&event->count); if (reset) local64_set(&event->count, 0); perf_event_ctx_unlock(event, ctx); return count; } EXPORT_SYMBOL_GPL(perf_event_pause); /* * Holding the top-level event's child_mutex means that any * descendant process that has inherited this event will block * in perf_event_exit_event() if it goes to exit, thus satisfying the * task existence requirements of perf_event_enable/disable. */ static void perf_event_for_each_child(struct perf_event *event, void (*func)(struct perf_event *)) { struct perf_event *child; WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->child_mutex); func(event); list_for_each_entry(child, &event->child_list, child_list) func(child); mutex_unlock(&event->child_mutex); } static void perf_event_for_each(struct perf_event *event, void (*func)(struct perf_event *)) { struct perf_event_context *ctx = event->ctx; struct perf_event *sibling; lockdep_assert_held(&ctx->mutex); event = event->group_leader; perf_event_for_each_child(event, func); for_each_sibling_event(sibling, event) perf_event_for_each_child(sibling, func); } static void __perf_event_period(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { u64 value = *((u64 *)info); bool active; if (event->attr.freq) { event->attr.sample_freq = value; } else { event->attr.sample_period = value; event->hw.sample_period = value; } active = (event->state == PERF_EVENT_STATE_ACTIVE); if (active) { perf_pmu_disable(event->pmu); /* * We could be throttled; unthrottle now to avoid the tick * trying to unthrottle while we already re-started the event. 
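 *
 * Illustrative aside (not part of this file): _perf_event_period() and its
 * siblings are reached from userspace through the PERF_EVENT_IOC_* ioctls
 * handled further below.  A sketch of retuning the sample period and doing a
 * "stop, read, reset, restart" cycle on an event fd (helper name is
 * hypothetical, read_format assumed to be 0):
 *
 *	#include <stdint.h>
 *	#include <sys/ioctl.h>
 *	#include <unistd.h>
 *	#include <linux/perf_event.h>
 *
 *	static int retune_and_sample(int fd, uint64_t new_period, uint64_t *count)
 *	{
 *		// The new period must be non-zero, see _perf_event_period().
 *		if (ioctl(fd, PERF_EVENT_IOC_PERIOD, &new_period))
 *			return -1;
 *
 *		ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);	// cf. perf_event_pause()
 *		if (read(fd, count, sizeof(*count)) != sizeof(*count))
 *			return -1;
 *		ioctl(fd, PERF_EVENT_IOC_RESET, 0);
 *		ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 *		return 0;
 *	}
 *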
*/ if (event->hw.interrupts == MAX_INTERRUPTS) { event->hw.interrupts = 0; perf_log_throttle(event, 1); } event->pmu->stop(event, PERF_EF_UPDATE); } local64_set(&event->hw.period_left, 0); if (active) { event->pmu->start(event, PERF_EF_RELOAD); perf_pmu_enable(event->pmu); } } static int perf_event_check_period(struct perf_event *event, u64 value) { return event->pmu->check_period(event, value); } static int _perf_event_period(struct perf_event *event, u64 value) { if (!is_sampling_event(event)) return -EINVAL; if (!value) return -EINVAL; if (event->attr.freq && value > sysctl_perf_event_sample_rate) return -EINVAL; if (perf_event_check_period(event, value)) return -EINVAL; if (!event->attr.freq && (value & (1ULL << 63))) return -EINVAL; event_function_call(event, __perf_event_period, &value); return 0; } int perf_event_period(struct perf_event *event, u64 value) { struct perf_event_context *ctx; int ret; ctx = perf_event_ctx_lock(event); ret = _perf_event_period(event, value); perf_event_ctx_unlock(event, ctx); return ret; } EXPORT_SYMBOL_GPL(perf_event_period); static const struct file_operations perf_fops; static inline bool is_perf_file(struct fd f) { return !fd_empty(f) && fd_file(f)->f_op == &perf_fops; } static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); static int perf_copy_attr(struct perf_event_attr __user *uattr, struct perf_event_attr *attr); static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { void (*func)(struct perf_event *); u32 flags = arg; switch (cmd) { case PERF_EVENT_IOC_ENABLE: func = _perf_event_enable; break; case PERF_EVENT_IOC_DISABLE: func = _perf_event_disable; break; case PERF_EVENT_IOC_RESET: func = _perf_event_reset; break; case PERF_EVENT_IOC_REFRESH: return _perf_event_refresh(event, arg); case PERF_EVENT_IOC_PERIOD: { u64 value; if (copy_from_user(&value, (u64 __user *)arg, sizeof(value))) return -EFAULT; return _perf_event_period(event, value); } case PERF_EVENT_IOC_ID: { u64 id = primary_event_id(event); if (copy_to_user((void __user *)arg, &id, sizeof(id))) return -EFAULT; return 0; } case PERF_EVENT_IOC_SET_OUTPUT: { CLASS(fd, output)(arg); // arg == -1 => empty struct perf_event *output_event = NULL; if (arg != -1) { if (!is_perf_file(output)) return -EBADF; output_event = fd_file(output)->private_data; } return perf_event_set_output(event, output_event); } case PERF_EVENT_IOC_SET_FILTER: return perf_event_set_filter(event, (void __user *)arg); case PERF_EVENT_IOC_SET_BPF: { struct bpf_prog *prog; int err; prog = bpf_prog_get(arg); if (IS_ERR(prog)) return PTR_ERR(prog); err = perf_event_set_bpf_prog(event, prog, 0); if (err) { bpf_prog_put(prog); return err; } return 0; } case PERF_EVENT_IOC_PAUSE_OUTPUT: { struct perf_buffer *rb; rcu_read_lock(); rb = rcu_dereference(event->rb); if (!rb || !rb->nr_pages) { rcu_read_unlock(); return -EINVAL; } rb_toggle_paused(rb, !!arg); rcu_read_unlock(); return 0; } case PERF_EVENT_IOC_QUERY_BPF: return perf_event_query_prog_array(event, (void __user *)arg); case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: { struct perf_event_attr new_attr; int err = perf_copy_attr((struct perf_event_attr __user *)arg, &new_attr); if (err) return err; return perf_event_modify_attr(event, &new_attr); } default: return -ENOTTY; } if (flags & PERF_IOC_FLAG_GROUP) perf_event_for_each(event, func); else perf_event_for_each_child(event, func); return 0; } static long perf_ioctl(struct file 
*file, unsigned int cmd, unsigned long arg) { struct perf_event *event = file->private_data; struct perf_event_context *ctx; long ret; /* Treat ioctl like writes as it is likely a mutating operation. */ ret = security_perf_event_write(event); if (ret) return ret; ctx = perf_event_ctx_lock(event); ret = _perf_ioctl(event, cmd, arg); perf_event_ctx_unlock(event, ctx); return ret; } #ifdef CONFIG_COMPAT static long perf_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { switch (_IOC_NR(cmd)) { case _IOC_NR(PERF_EVENT_IOC_SET_FILTER): case _IOC_NR(PERF_EVENT_IOC_ID): case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF): case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES): /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */ if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) { cmd &= ~IOCSIZE_MASK; cmd |= sizeof(void *) << IOCSIZE_SHIFT; } break; } return perf_ioctl(file, cmd, arg); } #else # define perf_compat_ioctl NULL #endif int perf_event_task_enable(void) { struct perf_event_context *ctx; struct perf_event *event; mutex_lock(&current->perf_event_mutex); list_for_each_entry(event, &current->perf_event_list, owner_entry) { ctx = perf_event_ctx_lock(event); perf_event_for_each_child(event, _perf_event_enable); perf_event_ctx_unlock(event, ctx); } mutex_unlock(&current->perf_event_mutex); return 0; } int perf_event_task_disable(void) { struct perf_event_context *ctx; struct perf_event *event; mutex_lock(&current->perf_event_mutex); list_for_each_entry(event, &current->perf_event_list, owner_entry) { ctx = perf_event_ctx_lock(event); perf_event_for_each_child(event, _perf_event_disable); perf_event_ctx_unlock(event, ctx); } mutex_unlock(&current->perf_event_mutex); return 0; } static int perf_event_index(struct perf_event *event) { if (event->hw.state & PERF_HES_STOPPED) return 0; if (event->state != PERF_EVENT_STATE_ACTIVE) return 0; return event->pmu->event_idx(event); } static void perf_event_init_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; struct perf_buffer *rb; rcu_read_lock(); rb = rcu_dereference(event->rb); if (!rb) goto unlock; userpg = rb->user_page; /* Allow new userspace to detect that bit 0 is deprecated */ userpg->cap_bit0_is_deprecated = 1; userpg->size = offsetof(struct perf_event_mmap_page, __reserved); userpg->data_offset = PAGE_SIZE; userpg->data_size = perf_data_size(rb); unlock: rcu_read_unlock(); } void __weak arch_perf_update_userpage( struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) { } /* * Callers need to ensure there can be no nesting of this function, otherwise * the seqlock logic goes bad. We can not serialize this because the arch * code calls this from NMI context. */ void perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; struct perf_buffer *rb; u64 enabled, running, now; rcu_read_lock(); rb = rcu_dereference(event->rb); if (!rb) goto unlock; /* * compute total_time_enabled, total_time_running * based on snapshot values taken when the event * was last scheduled in. * * we cannot simply called update_context_time() * because of locking issue as we can be called in * NMI context */ calc_timer_values(event, &now, &enabled, &running); userpg = rb->user_page; /* * Disable preemption to guarantee consistent time stamps are stored to * the user page. 
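 *
 * Illustrative aside (not part of this file): the ++userpg->lock / barrier()
 * pairs below form a seqcount over the user page; a reader of the mmap'ed
 * struct perf_event_mmap_page (first page of the ring-buffer mapping) is
 * expected to retry while the count is odd or has changed.  Sketch of the
 * userspace side (helper name hypothetical, rdpmc-style self-read omitted):
 *
 *	#include <stdint.h>
 *	#include <linux/perf_event.h>
 *
 *	static uint64_t read_user_count(volatile struct perf_event_mmap_page *pc,
 *					uint64_t *enabled, uint64_t *running)
 *	{
 *		uint32_t seq;
 *		uint64_t offset;
 *
 *		do {
 *			seq = pc->lock;
 *			__atomic_thread_fence(__ATOMIC_ACQUIRE);
 *			offset   = pc->offset;
 *			*enabled = pc->time_enabled;
 *			*running = pc->time_running;
 *			__atomic_thread_fence(__ATOMIC_ACQUIRE);
 *		} while (pc->lock != seq || (seq & 1));
 *
 *		return offset;
 *	}
 *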
*/ preempt_disable(); ++userpg->lock; barrier(); userpg->index = perf_event_index(event); userpg->offset = perf_event_count(event, false); if (userpg->index) userpg->offset -= local64_read(&event->hw.prev_count); userpg->time_enabled = enabled + atomic64_read(&event->child_total_time_enabled); userpg->time_running = running + atomic64_read(&event->child_total_time_running); arch_perf_update_userpage(event, userpg, now); barrier(); ++userpg->lock; preempt_enable(); unlock: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(perf_event_update_userpage); static vm_fault_t perf_mmap_fault(struct vm_fault *vmf) { struct perf_event *event = vmf->vma->vm_file->private_data; struct perf_buffer *rb; vm_fault_t ret = VM_FAULT_SIGBUS; if (vmf->flags & FAULT_FLAG_MKWRITE) { if (vmf->pgoff == 0) ret = 0; return ret; } rcu_read_lock(); rb = rcu_dereference(event->rb); if (!rb) goto unlock; if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) goto unlock; vmf->page = perf_mmap_to_page(rb, vmf->pgoff); if (!vmf->page) goto unlock; get_page(vmf->page); vmf->page->mapping = vmf->vma->vm_file->f_mapping; vmf->page->index = vmf->pgoff; ret = 0; unlock: rcu_read_unlock(); return ret; } static void ring_buffer_attach(struct perf_event *event, struct perf_buffer *rb) { struct perf_buffer *old_rb = NULL; unsigned long flags; WARN_ON_ONCE(event->parent); if (event->rb) { /* * Should be impossible, we set this when removing * event->rb_entry and wait/clear when adding event->rb_entry. */ WARN_ON_ONCE(event->rcu_pending); old_rb = event->rb; spin_lock_irqsave(&old_rb->event_lock, flags); list_del_rcu(&event->rb_entry); spin_unlock_irqrestore(&old_rb->event_lock, flags); event->rcu_batches = get_state_synchronize_rcu(); event->rcu_pending = 1; } if (rb) { if (event->rcu_pending) { cond_synchronize_rcu(event->rcu_batches); event->rcu_pending = 0; } spin_lock_irqsave(&rb->event_lock, flags); list_add_rcu(&event->rb_entry, &rb->event_list); spin_unlock_irqrestore(&rb->event_lock, flags); } /* * Avoid racing with perf_mmap_close(AUX): stop the event * before swizzling the event::rb pointer; if it's getting * unmapped, its aux_mmap_count will be 0 and it won't * restart. See the comment in __perf_pmu_output_stop(). * * Data will inevitably be lost when set_output is done in * mid-air, but then again, whoever does it like this is * not in for the data anyway. */ if (has_aux(event)) perf_event_stop(event, 0); rcu_assign_pointer(event->rb, rb); if (old_rb) { ring_buffer_put(old_rb); /* * Since we detached before setting the new rb, so that we * could attach the new rb, we could have missed a wakeup. * Provide it now. 
*/ wake_up_all(&event->waitq); } } static void ring_buffer_wakeup(struct perf_event *event) { struct perf_buffer *rb; if (event->parent) event = event->parent; rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { list_for_each_entry_rcu(event, &rb->event_list, rb_entry) wake_up_all(&event->waitq); } rcu_read_unlock(); } struct perf_buffer *ring_buffer_get(struct perf_event *event) { struct perf_buffer *rb; if (event->parent) event = event->parent; rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { if (!refcount_inc_not_zero(&rb->refcount)) rb = NULL; } rcu_read_unlock(); return rb; } void ring_buffer_put(struct perf_buffer *rb) { if (!refcount_dec_and_test(&rb->refcount)) return; WARN_ON_ONCE(!list_empty(&rb->event_list)); call_rcu(&rb->rcu_head, rb_free_rcu); } static void perf_mmap_open(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; atomic_inc(&event->mmap_count); atomic_inc(&event->rb->mmap_count); if (vma->vm_pgoff) atomic_inc(&event->rb->aux_mmap_count); if (event->pmu->event_mapped) event->pmu->event_mapped(event, vma->vm_mm); } static void perf_pmu_output_stop(struct perf_event *event); /* * A buffer can be mmap()ed multiple times; either directly through the same * event, or through other events by use of perf_event_set_output(). * * In order to undo the VM accounting done by perf_mmap() we need to destroy * the buffer here, where we still have a VM context. This means we need * to detach all events redirecting to us. */ static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; struct perf_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; int mmap_locked = rb->mmap_locked; unsigned long size = perf_data_size(rb); bool detach_rest = false; if (event->pmu->event_unmapped) event->pmu->event_unmapped(event, vma->vm_mm); /* * The AUX buffer is strictly a sub-buffer, serialize using aux_mutex * to avoid complications. */ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) { /* * Stop all AUX events that are writing to this buffer, * so that we can free its AUX pages and corresponding PMU * data. Note that after rb::aux_mmap_count dropped to zero, * they won't start any more (see perf_aux_output_begin()). */ perf_pmu_output_stop(event); /* now it's safe to free the pages */ atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm); atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); /* this has to be the last one */ rb_free_aux(rb); WARN_ON_ONCE(refcount_read(&rb->aux_refcount)); mutex_unlock(&rb->aux_mutex); } if (atomic_dec_and_test(&rb->mmap_count)) detach_rest = true; if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) goto out_put; ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); /* If there's still other mmap()s of this buffer, we're done. */ if (!detach_rest) goto out_put; /* * No other mmap()s, detach from all other events that might redirect * into the now unreachable buffer. Somewhat complicated by the * fact that rb::event_lock otherwise nests inside mmap_mutex. */ again: rcu_read_lock(); list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { if (!atomic_long_inc_not_zero(&event->refcount)) { /* * This event is en-route to free_event() which will * detach it and remove it from the list. 
*/ continue; } rcu_read_unlock(); mutex_lock(&event->mmap_mutex); /* * Check we didn't race with perf_event_set_output() which can * swizzle the rb from under us while we were waiting to * acquire mmap_mutex. * * If we find a different rb; ignore this event, a next * iteration will no longer find it on the list. We have to * still restart the iteration to make sure we're not now * iterating the wrong list. */ if (event->rb == rb) ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); put_event(event); /* * Restart the iteration; either we're on the wrong list or * destroyed its integrity by doing a deletion. */ goto again; } rcu_read_unlock(); /* * It could be there's still a few 0-ref events on the list; they'll * get cleaned up by free_event() -- they'll also still have their * ref on the rb and will free it whenever they are done with it. * * Aside from that, this buffer is 'fully' detached and unmapped, * undo the VM accounting. */ atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked, &mmap_user->locked_vm); atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm); free_uid(mmap_user); out_put: ring_buffer_put(rb); /* could be last */ } static const struct vm_operations_struct perf_mmap_vmops = { .open = perf_mmap_open, .close = perf_mmap_close, /* non mergeable */ .fault = perf_mmap_fault, .page_mkwrite = perf_mmap_fault, }; static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_event *event = file->private_data; unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); struct mutex *aux_mutex = NULL; struct perf_buffer *rb = NULL; unsigned long locked, lock_limit; unsigned long vma_size; unsigned long nr_pages; long user_extra = 0, extra = 0; int ret = 0, flags = 0; /* * Don't allow mmap() of inherited per-task counters. This would * create a performance issue due to all children writing to the * same rb. */ if (event->cpu == -1 && event->attr.inherit) return -EINVAL; if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; ret = security_perf_event_read(event); if (ret) return ret; vma_size = vma->vm_end - vma->vm_start; if (vma->vm_pgoff == 0) { nr_pages = (vma_size / PAGE_SIZE) - 1; } else { /* * AUX area mapping: if rb->aux_nr_pages != 0, it's already * mapped, all subsequent mappings should have the same size * and offset. Must be above the normal perf buffer. */ u64 aux_offset, aux_size; if (!event->rb) return -EINVAL; nr_pages = vma_size / PAGE_SIZE; if (nr_pages > INT_MAX) return -ENOMEM; mutex_lock(&event->mmap_mutex); ret = -EINVAL; rb = event->rb; if (!rb) goto aux_unlock; aux_mutex = &rb->aux_mutex; mutex_lock(aux_mutex); aux_offset = READ_ONCE(rb->user_page->aux_offset); aux_size = READ_ONCE(rb->user_page->aux_size); if (aux_offset < perf_data_size(rb) + PAGE_SIZE) goto aux_unlock; if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) goto aux_unlock; /* already mapped with a different offset */ if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) goto aux_unlock; if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE) goto aux_unlock; /* already mapped with a different size */ if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) goto aux_unlock; if (!is_power_of_2(nr_pages)) goto aux_unlock; if (!atomic_inc_not_zero(&rb->mmap_count)) goto aux_unlock; if (rb_has_aux(rb)) { atomic_inc(&rb->aux_mmap_count); ret = 0; goto unlock; } atomic_set(&rb->aux_mmap_count, 1); user_extra = nr_pages; goto accounting; } /* * If we have rb pages ensure they're a power-of-two number, so we * can do bitmasks instead of modulo. 
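 *
 * Illustrative aside (not part of this file): the "1 + 2^n pages" rule
 * enforced below is what a userspace mapping has to follow; page 0 is the
 * struct perf_event_mmap_page, the remaining power-of-two pages hold the
 * records.  A minimal consumer sketch (helper name and DATA_PAGES are
 * arbitrary, record wrap-around at the buffer end not handled):
 *
 *	#include <stdint.h>
 *	#include <sys/mman.h>
 *	#include <unistd.h>
 *	#include <linux/perf_event.h>
 *
 *	#define DATA_PAGES	8		// must be a power of two
 *
 *	static void consume(int fd)
 *	{
 *		long psize = sysconf(_SC_PAGESIZE);
 *		size_t len = (1 + DATA_PAGES) * psize;
 *		struct perf_event_mmap_page *pc;
 *		char *data;
 *		uint64_t head, tail;
 *
 *		pc = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *		if (pc == MAP_FAILED)
 *			return;
 *		data = (char *)pc + psize;
 *
 *		head = __atomic_load_n(&pc->data_head, __ATOMIC_ACQUIRE);
 *		tail = pc->data_tail;
 *		while (tail < head) {
 *			struct perf_event_header *eh =
 *				(void *)(data + (tail & (DATA_PAGES * psize - 1)));
 *
 *			// decode eh->type / eh->size here
 *			tail += eh->size;
 *		}
 *		__atomic_store_n(&pc->data_tail, tail, __ATOMIC_RELEASE);
 *		munmap(pc, len);
 *	}
 *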
*/ if (nr_pages != 0 && !is_power_of_2(nr_pages)) return -EINVAL; if (vma_size != PAGE_SIZE * (1 + nr_pages)) return -EINVAL; WARN_ON_ONCE(event->ctx->parent_ctx); again: mutex_lock(&event->mmap_mutex); if (event->rb) { if (data_page_nr(event->rb) != nr_pages) { ret = -EINVAL; goto unlock; } if (!atomic_inc_not_zero(&event->rb->mmap_count)) { /* * Raced against perf_mmap_close(); remove the * event and try again. */ ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); goto again; } goto unlock; } user_extra = nr_pages + 1; accounting: user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); /* * Increase the limit linearly with more CPUs: */ user_lock_limit *= num_online_cpus(); user_locked = atomic_long_read(&user->locked_vm); /* * sysctl_perf_event_mlock may have changed, so that * user->locked_vm > user_lock_limit */ if (user_locked > user_lock_limit) user_locked = user_lock_limit; user_locked += user_extra; if (user_locked > user_lock_limit) { /* * charge locked_vm until it hits user_lock_limit; * charge the rest from pinned_vm */ extra = user_locked - user_lock_limit; user_extra -= extra; } lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra; if ((locked > lock_limit) && perf_is_paranoid() && !capable(CAP_IPC_LOCK)) { ret = -EPERM; goto unlock; } WARN_ON(!rb && event->rb); if (vma->vm_flags & VM_WRITE) flags |= RING_BUFFER_WRITABLE; if (!rb) { rb = rb_alloc(nr_pages, event->attr.watermark ? event->attr.wakeup_watermark : 0, event->cpu, flags); if (!rb) { ret = -ENOMEM; goto unlock; } atomic_set(&rb->mmap_count, 1); rb->mmap_user = get_current_user(); rb->mmap_locked = extra; ring_buffer_attach(event, rb); perf_event_update_time(event); perf_event_init_userpage(event); perf_event_update_userpage(event); } else { ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, event->attr.aux_watermark, flags); if (!ret) rb->aux_mmap_locked = extra; } unlock: if (!ret) { atomic_long_add(user_extra, &user->locked_vm); atomic64_add(extra, &vma->vm_mm->pinned_vm); atomic_inc(&event->mmap_count); } else if (rb) { atomic_dec(&rb->mmap_count); } aux_unlock: if (aux_mutex) mutex_unlock(aux_mutex); mutex_unlock(&event->mmap_mutex); /* * Since pinned accounting is per vm we cannot allow fork() to copy our * vma. */ vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &perf_mmap_vmops; if (event->pmu->event_mapped) event->pmu->event_mapped(event, vma->vm_mm); return ret; } static int perf_fasync(int fd, struct file *filp, int on) { struct inode *inode = file_inode(filp); struct perf_event *event = filp->private_data; int retval; inode_lock(inode); retval = fasync_helper(fd, filp, on, &event->fasync); inode_unlock(inode); if (retval < 0) return retval; return 0; } static const struct file_operations perf_fops = { .release = perf_release, .read = perf_read, .poll = perf_poll, .unlocked_ioctl = perf_ioctl, .compat_ioctl = perf_compat_ioctl, .mmap = perf_mmap, .fasync = perf_fasync, }; /* * Perf event wakeup * * If there's data, ensure we set the poll() state and publish everything * to user-space before waking everybody up. */ void perf_event_wakeup(struct perf_event *event) { ring_buffer_wakeup(event); if (event->pending_kill) { kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill); event->pending_kill = 0; } } static void perf_sigtrap(struct perf_event *event) { /* * We'd expect this to only occur if the irq_work is delayed and either * ctx->task or current has changed in the meantime. 
This can be the * case on architectures that do not implement arch_irq_work_raise(). */ if (WARN_ON_ONCE(event->ctx->task != current)) return; /* * Both perf_pending_task() and perf_pending_irq() can race with the * task exiting. */ if (current->flags & PF_EXITING) return; send_sig_perf((void __user *)event->pending_addr, event->orig_type, event->attr.sig_data); } /* * Deliver the pending work in-event-context or follow the context. */ static void __perf_pending_disable(struct perf_event *event) { int cpu = READ_ONCE(event->oncpu); /* * If the event isn't running; we done. event_sched_out() will have * taken care of things. */ if (cpu < 0) return; /* * Yay, we hit home and are in the context of the event. */ if (cpu == smp_processor_id()) { if (event->pending_disable) { event->pending_disable = 0; perf_event_disable_local(event); } return; } /* * CPU-A CPU-B * * perf_event_disable_inatomic() * @pending_disable = CPU-A; * irq_work_queue(); * * sched-out * @pending_disable = -1; * * sched-in * perf_event_disable_inatomic() * @pending_disable = CPU-B; * irq_work_queue(); // FAILS * * irq_work_run() * perf_pending_disable() * * But the event runs on CPU-B and wants disabling there. */ irq_work_queue_on(&event->pending_disable_irq, cpu); } static void perf_pending_disable(struct irq_work *entry) { struct perf_event *event = container_of(entry, struct perf_event, pending_disable_irq); int rctx; /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. */ rctx = perf_swevent_get_recursion_context(); __perf_pending_disable(event); if (rctx >= 0) perf_swevent_put_recursion_context(rctx); } static void perf_pending_irq(struct irq_work *entry) { struct perf_event *event = container_of(entry, struct perf_event, pending_irq); int rctx; /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. */ rctx = perf_swevent_get_recursion_context(); /* * The wakeup isn't bound to the context of the event -- it can happen * irrespective of where the event is. */ if (event->pending_wakeup) { event->pending_wakeup = 0; perf_event_wakeup(event); } if (rctx >= 0) perf_swevent_put_recursion_context(rctx); } static void perf_pending_task(struct callback_head *head) { struct perf_event *event = container_of(head, struct perf_event, pending_task); int rctx; /* * All accesses to the event must belong to the same implicit RCU read-side * critical section as the ->pending_work reset. See comment in * perf_pending_task_sync(). */ rcu_read_lock(); /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. 
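 *
 * Illustrative aside (not part of this file): perf_sigtrap() above is the
 * sending side of the attr.sigtrap interface.  On the receiving side the
 * event is created with attr.sigtrap = 1 (which requires
 * attr.remove_on_exec = 1) and an attr.sig_data cookie, and the task gets a
 * SIGTRAP with si_code == TRAP_PERF; the cookie and sample address come back
 * in the siginfo (field names as exposed by recent uapi/glibc headers):
 *
 *	#include <signal.h>
 *
 *	static volatile sig_atomic_t got_trap;
 *	static volatile unsigned long long trap_cookie;
 *
 *	static void on_trap(int sig, siginfo_t *info, void *ucontext)
 *	{
 *		if (info->si_code == TRAP_PERF) {
 *			trap_cookie = info->si_perf_data;	// attr.sig_data
 *			got_trap = 1;				// info->si_addr: sample address
 *		}
 *	}
 *
 *	// setup (sketch):
 *	//	struct sigaction sa = { .sa_sigaction = on_trap, .sa_flags = SA_SIGINFO };
 *	//	sigaction(SIGTRAP, &sa, NULL);
 *	//	attr.sigtrap = 1; attr.remove_on_exec = 1; attr.sig_data = 0xfeedULL;
 *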
*/ rctx = perf_swevent_get_recursion_context(); if (event->pending_work) { event->pending_work = 0; perf_sigtrap(event); local_dec(&event->ctx->nr_no_switch_fast); rcuwait_wake_up(&event->pending_work_wait); } rcu_read_unlock(); if (rctx >= 0) perf_swevent_put_recursion_context(rctx); } #ifdef CONFIG_GUEST_PERF_EVENTS struct perf_guest_info_callbacks __rcu *perf_guest_cbs; DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state); DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip); DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs))) return; rcu_assign_pointer(perf_guest_cbs, cbs); static_call_update(__perf_guest_state, cbs->state); static_call_update(__perf_guest_get_ip, cbs->get_ip); /* Implementing ->handle_intel_pt_intr is optional. */ if (cbs->handle_intel_pt_intr) static_call_update(__perf_guest_handle_intel_pt_intr, cbs->handle_intel_pt_intr); } EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs)) return; rcu_assign_pointer(perf_guest_cbs, NULL); static_call_update(__perf_guest_state, (void *)&__static_call_return0); static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0); static_call_update(__perf_guest_handle_intel_pt_intr, (void *)&__static_call_return0); synchronize_rcu(); } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); #endif static bool should_sample_guest(struct perf_event *event) { return !event->attr.exclude_guest && perf_guest_state(); } unsigned long perf_misc_flags(struct perf_event *event, struct pt_regs *regs) { if (should_sample_guest(event)) return perf_arch_guest_misc_flags(regs); return perf_arch_misc_flags(regs); } unsigned long perf_instruction_pointer(struct perf_event *event, struct pt_regs *regs) { if (should_sample_guest(event)) return perf_guest_get_ip(); return perf_arch_instruction_pointer(regs); } static void perf_output_sample_regs(struct perf_output_handle *handle, struct pt_regs *regs, u64 mask) { int bit; DECLARE_BITMAP(_mask, 64); bitmap_from_u64(_mask, mask); for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) { u64 val; val = perf_reg_value(regs, bit); perf_output_put(handle, val); } } static void perf_sample_regs_user(struct perf_regs *regs_user, struct pt_regs *regs) { if (user_mode(regs)) { regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; } else if (!(current->flags & PF_KTHREAD)) { perf_get_regs_user(regs_user, regs); } else { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; regs_user->regs = NULL; } } static void perf_sample_regs_intr(struct perf_regs *regs_intr, struct pt_regs *regs) { regs_intr->regs = regs; regs_intr->abi = perf_reg_abi(current); } /* * Get remaining task size from user stack pointer. * * It'd be better to take stack vma map and limit this more * precisely, but there's no way to get it safely under interrupt, * so using TASK_SIZE as limit. */ static u64 perf_ustack_task_size(struct pt_regs *regs) { unsigned long addr = perf_user_stack_pointer(regs); if (!addr || addr >= TASK_SIZE) return 0; return TASK_SIZE - addr; } static u16 perf_sample_ustack_size(u16 stack_size, u16 header_size, struct pt_regs *regs) { u64 task_size; /* No regs, no stack pointer, no dump. 
*/ if (!regs) return 0; /* * Check if we fit in with the requested stack size into the: * - TASK_SIZE * If we don't, we limit the size to the TASK_SIZE. * * - remaining sample size * If we don't, we customize the stack size to * fit in to the remaining sample size. */ task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs)); stack_size = min(stack_size, (u16) task_size); /* Current header size plus static size and dynamic size. */ header_size += 2 * sizeof(u64); /* Do we fit in with the current stack dump size? */ if ((u16) (header_size + stack_size) < header_size) { /* * If we overflow the maximum size for the sample, * we customize the stack dump size to fit in. */ stack_size = USHRT_MAX - header_size - sizeof(u64); stack_size = round_up(stack_size, sizeof(u64)); } return stack_size; } static void perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, struct pt_regs *regs) { /* Case of a kernel thread, nothing to dump */ if (!regs) { u64 size = 0; perf_output_put(handle, size); } else { unsigned long sp; unsigned int rem; u64 dyn_size; /* * We dump: * static size * - the size requested by user or the best one we can fit * in to the sample max size * data * - user stack dump data * dynamic size * - the actual dumped size */ /* Static size. */ perf_output_put(handle, dump_size); /* Data. */ sp = perf_user_stack_pointer(regs); rem = __output_copy_user(handle, (void *) sp, dump_size); dyn_size = dump_size - rem; perf_output_skip(handle, rem); /* Dynamic size. */ perf_output_put(handle, dyn_size); } } static unsigned long perf_prepare_sample_aux(struct perf_event *event, struct perf_sample_data *data, size_t size) { struct perf_event *sampler = event->aux_event; struct perf_buffer *rb; data->aux_size = 0; if (!sampler) goto out; if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE)) goto out; if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id())) goto out; rb = ring_buffer_get(sampler); if (!rb) goto out; /* * If this is an NMI hit inside sampling code, don't take * the sample. See also perf_aux_sample_output(). */ if (READ_ONCE(rb->aux_in_sampling)) { data->aux_size = 0; } else { size = min_t(size_t, size, perf_aux_size(rb)); data->aux_size = ALIGN(size, sizeof(u64)); } ring_buffer_put(rb); out: return data->aux_size; } static long perf_pmu_snapshot_aux(struct perf_buffer *rb, struct perf_event *event, struct perf_output_handle *handle, unsigned long size) { unsigned long flags; long ret; /* * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler * paths. If we start calling them in NMI context, they may race with * the IRQ ones, that is, for example, re-starting an event that's just * been stopped, which is why we're using a separate callback that * doesn't change the event state. * * IRQs need to be disabled to prevent IPIs from racing with us. */ local_irq_save(flags); /* * Guard against NMI hits inside the critical section; * see also perf_prepare_sample_aux(). 
*/ WRITE_ONCE(rb->aux_in_sampling, 1); barrier(); ret = event->pmu->snapshot_aux(event, handle, size); barrier(); WRITE_ONCE(rb->aux_in_sampling, 0); local_irq_restore(flags); return ret; } static void perf_aux_sample_output(struct perf_event *event, struct perf_output_handle *handle, struct perf_sample_data *data) { struct perf_event *sampler = event->aux_event; struct perf_buffer *rb; unsigned long pad; long size; if (WARN_ON_ONCE(!sampler || !data->aux_size)) return; rb = ring_buffer_get(sampler); if (!rb) return; size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size); /* * An error here means that perf_output_copy() failed (returned a * non-zero surplus that it didn't copy), which in its current * enlightened implementation is not possible. If that changes, we'd * like to know. */ if (WARN_ON_ONCE(size < 0)) goto out_put; /* * The pad comes from ALIGN()ing data->aux_size up to u64 in * perf_prepare_sample_aux(), so should not be more than that. */ pad = data->aux_size - size; if (WARN_ON_ONCE(pad >= sizeof(u64))) pad = 8; if (pad) { u64 zero = 0; perf_output_copy(handle, &zero, pad); } out_put: ring_buffer_put(rb); } /* * A set of common sample data types saved even for non-sample records * when event->attr.sample_id_all is set. */ #define PERF_SAMPLE_ID_ALL (PERF_SAMPLE_TID | PERF_SAMPLE_TIME | \ PERF_SAMPLE_ID | PERF_SAMPLE_STREAM_ID | \ PERF_SAMPLE_CPU | PERF_SAMPLE_IDENTIFIER) static void __perf_event_header__init_id(struct perf_sample_data *data, struct perf_event *event, u64 sample_type) { data->type = event->attr.sample_type; data->sample_flags |= data->type & PERF_SAMPLE_ID_ALL; if (sample_type & PERF_SAMPLE_TID) { /* namespace issues */ data->tid_entry.pid = perf_event_pid(event, current); data->tid_entry.tid = perf_event_tid(event, current); } if (sample_type & PERF_SAMPLE_TIME) data->time = perf_event_clock(event); if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) data->id = primary_event_id(event); if (sample_type & PERF_SAMPLE_STREAM_ID) data->stream_id = event->id; if (sample_type & PERF_SAMPLE_CPU) { data->cpu_entry.cpu = raw_smp_processor_id(); data->cpu_entry.reserved = 0; } } void perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) { if (event->attr.sample_id_all) { header->size += event->id_header_size; __perf_event_header__init_id(data, event, event->attr.sample_type); } } static void __perf_event__output_id_sample(struct perf_output_handle *handle, struct perf_sample_data *data) { u64 sample_type = data->type; if (sample_type & PERF_SAMPLE_TID) perf_output_put(handle, data->tid_entry); if (sample_type & PERF_SAMPLE_TIME) perf_output_put(handle, data->time); if (sample_type & PERF_SAMPLE_ID) perf_output_put(handle, data->id); if (sample_type & PERF_SAMPLE_STREAM_ID) perf_output_put(handle, data->stream_id); if (sample_type & PERF_SAMPLE_CPU) perf_output_put(handle, data->cpu_entry); if (sample_type & PERF_SAMPLE_IDENTIFIER) perf_output_put(handle, data->id); } void perf_event__output_id_sample(struct perf_event *event, struct perf_output_handle *handle, struct perf_sample_data *sample) { if (event->attr.sample_id_all) __perf_event__output_id_sample(handle, sample); } static void perf_output_read_one(struct perf_output_handle *handle, struct perf_event *event, u64 enabled, u64 running) { u64 read_format = event->attr.read_format; u64 values[5]; int n = 0; values[n++] = perf_event_count(event, has_inherit_and_sample_read(&event->attr)); if (read_format & 
PERF_FORMAT_TOTAL_TIME_ENABLED) { values[n++] = enabled + atomic64_read(&event->child_total_time_enabled); } if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { values[n++] = running + atomic64_read(&event->child_total_time_running); } if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&event->lost_samples); __output_copy(handle, values, n * sizeof(u64)); } static void perf_output_read_group(struct perf_output_handle *handle, struct perf_event *event, u64 enabled, u64 running) { struct perf_event *leader = event->group_leader, *sub; u64 read_format = event->attr.read_format; unsigned long flags; u64 values[6]; int n = 0; bool self = has_inherit_and_sample_read(&event->attr); /* * Disabling interrupts avoids all counter scheduling * (context switches, timer based rotation and IPIs). */ local_irq_save(flags); values[n++] = 1 + leader->nr_siblings; if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) values[n++] = enabled; if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; if ((leader != event) && (leader->state == PERF_EVENT_STATE_ACTIVE)) leader->pmu->read(leader); values[n++] = perf_event_count(leader, self); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&leader->lost_samples); __output_copy(handle, values, n * sizeof(u64)); for_each_sibling_event(sub, leader) { n = 0; if ((sub != event) && (sub->state == PERF_EVENT_STATE_ACTIVE)) sub->pmu->read(sub); values[n++] = perf_event_count(sub, self); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&sub->lost_samples); __output_copy(handle, values, n * sizeof(u64)); } local_irq_restore(flags); } #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ PERF_FORMAT_TOTAL_TIME_RUNNING) /* * XXX PERF_SAMPLE_READ vs inherited events seems difficult. * * The problem is that its both hard and excessively expensive to iterate the * child list, not to mention that its impossible to IPI the children running * on another CPU, from interrupt/NMI context. * * Instead the combination of PERF_SAMPLE_READ and inherit will track per-thread * counts rather than attempting to accumulate some value across all children on * all cores. */ static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) { u64 enabled = 0, running = 0, now; u64 read_format = event->attr.read_format; /* * compute total_time_enabled, total_time_running * based on snapshot values taken when the event * was last scheduled in. 
* * we cannot simply called update_context_time() * because of locking issue as we are called in * NMI context */ if (read_format & PERF_FORMAT_TOTAL_TIMES) calc_timer_values(event, &now, &enabled, &running); if (event->attr.read_format & PERF_FORMAT_GROUP) perf_output_read_group(handle, event, enabled, running); else perf_output_read_one(handle, event, enabled, running); } void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) { u64 sample_type = data->type; perf_output_put(handle, *header); if (sample_type & PERF_SAMPLE_IDENTIFIER) perf_output_put(handle, data->id); if (sample_type & PERF_SAMPLE_IP) perf_output_put(handle, data->ip); if (sample_type & PERF_SAMPLE_TID) perf_output_put(handle, data->tid_entry); if (sample_type & PERF_SAMPLE_TIME) perf_output_put(handle, data->time); if (sample_type & PERF_SAMPLE_ADDR) perf_output_put(handle, data->addr); if (sample_type & PERF_SAMPLE_ID) perf_output_put(handle, data->id); if (sample_type & PERF_SAMPLE_STREAM_ID) perf_output_put(handle, data->stream_id); if (sample_type & PERF_SAMPLE_CPU) perf_output_put(handle, data->cpu_entry); if (sample_type & PERF_SAMPLE_PERIOD) perf_output_put(handle, data->period); if (sample_type & PERF_SAMPLE_READ) perf_output_read(handle, event); if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; size += data->callchain->nr; size *= sizeof(u64); __output_copy(handle, data->callchain, size); } if (sample_type & PERF_SAMPLE_RAW) { struct perf_raw_record *raw = data->raw; if (raw) { struct perf_raw_frag *frag = &raw->frag; perf_output_put(handle, raw->size); do { if (frag->copy) { __output_custom(handle, frag->copy, frag->data, frag->size); } else { __output_copy(handle, frag->data, frag->size); } if (perf_raw_frag_last(frag)) break; frag = frag->next; } while (1); if (frag->pad) __output_skip(handle, NULL, frag->pad); } else { struct { u32 size; u32 data; } raw = { .size = sizeof(u32), .data = 0, }; perf_output_put(handle, raw); } } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { if (data->br_stack) { size_t size; size = data->br_stack->nr * sizeof(struct perf_branch_entry); perf_output_put(handle, data->br_stack->nr); if (branch_sample_hw_index(event)) perf_output_put(handle, data->br_stack->hw_idx); perf_output_copy(handle, data->br_stack->entries, size); /* * Add the extension space which is appended * right after the struct perf_branch_stack. */ if (data->br_stack_cntr) { size = data->br_stack->nr * sizeof(u64); perf_output_copy(handle, data->br_stack_cntr, size); } } else { /* * we always store at least the value of nr */ u64 nr = 0; perf_output_put(handle, nr); } } if (sample_type & PERF_SAMPLE_REGS_USER) { u64 abi = data->regs_user.abi; /* * If there are no regs to dump, notice it through * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). 
*/ perf_output_put(handle, abi); if (abi) { u64 mask = event->attr.sample_regs_user; perf_output_sample_regs(handle, data->regs_user.regs, mask); } } if (sample_type & PERF_SAMPLE_STACK_USER) { perf_output_sample_ustack(handle, data->stack_user_size, data->regs_user.regs); } if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) perf_output_put(handle, data->weight.full); if (sample_type & PERF_SAMPLE_DATA_SRC) perf_output_put(handle, data->data_src.val); if (sample_type & PERF_SAMPLE_TRANSACTION) perf_output_put(handle, data->txn); if (sample_type & PERF_SAMPLE_REGS_INTR) { u64 abi = data->regs_intr.abi; /* * If there are no regs to dump, notice it through * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). */ perf_output_put(handle, abi); if (abi) { u64 mask = event->attr.sample_regs_intr; perf_output_sample_regs(handle, data->regs_intr.regs, mask); } } if (sample_type & PERF_SAMPLE_PHYS_ADDR) perf_output_put(handle, data->phys_addr); if (sample_type & PERF_SAMPLE_CGROUP) perf_output_put(handle, data->cgroup); if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) perf_output_put(handle, data->data_page_size); if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) perf_output_put(handle, data->code_page_size); if (sample_type & PERF_SAMPLE_AUX) { perf_output_put(handle, data->aux_size); if (data->aux_size) perf_aux_sample_output(event, handle, data); } if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; if (wakeup_events) { struct perf_buffer *rb = handle->rb; int events = local_inc_return(&rb->events); if (events >= wakeup_events) { local_sub(wakeup_events, &rb->events); local_inc(&rb->wakeup); } } } } static u64 perf_virt_to_phys(u64 virt) { u64 phys_addr = 0; if (!virt) return 0; if (virt >= TASK_SIZE) { /* If it's vmalloc()d memory, leave phys_addr as 0 */ if (virt_addr_valid((void *)(uintptr_t)virt) && !(virt >= VMALLOC_START && virt < VMALLOC_END)) phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt); } else { /* * Walking the pages tables for user address. * Interrupts are disabled, so it prevents any tear down * of the page tables. * Try IRQ-safe get_user_page_fast_only first. * If failed, leave phys_addr as 0. */ if (current->mm != NULL) { struct page *p; pagefault_disable(); if (get_user_page_fast_only(virt, 0, &p)) { phys_addr = page_to_phys(p) + virt % PAGE_SIZE; put_page(p); } pagefault_enable(); } } return phys_addr; } /* * Return the pagetable size of a given virtual address. 
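 *
 * Illustrative aside (not part of this file): perf_output_sample() above
 * writes the PERF_RECORD_SAMPLE body in the bit order of attr.sample_type,
 * so a consumer has to decode it with the same attr in hand.  For
 * sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_TIME |
 * PERF_SAMPLE_PERIOD the record reduces to the (hypothetical) struct below:
 *
 *	#include <stdint.h>
 *	#include <linux/perf_event.h>
 *
 *	struct my_sample {
 *		struct perf_event_header header;	// header.type == PERF_RECORD_SAMPLE
 *		uint64_t ip;				// PERF_SAMPLE_IP
 *		uint32_t pid, tid;			// PERF_SAMPLE_TID
 *		uint64_t time;				// PERF_SAMPLE_TIME
 *		uint64_t period;			// PERF_SAMPLE_PERIOD
 *	};
 *
 *	static uint64_t sample_ip(const void *rec)
 *	{
 *		const struct my_sample *s = rec;
 *
 *		return s->header.type == PERF_RECORD_SAMPLE ? s->ip : 0;
 *	}
 *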
*/ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr) { u64 size = 0; #ifdef CONFIG_HAVE_GUP_FAST pgd_t *pgdp, pgd; p4d_t *p4dp, p4d; pud_t *pudp, pud; pmd_t *pmdp, pmd; pte_t *ptep, pte; pgdp = pgd_offset(mm, addr); pgd = READ_ONCE(*pgdp); if (pgd_none(pgd)) return 0; if (pgd_leaf(pgd)) return pgd_leaf_size(pgd); p4dp = p4d_offset_lockless(pgdp, pgd, addr); p4d = READ_ONCE(*p4dp); if (!p4d_present(p4d)) return 0; if (p4d_leaf(p4d)) return p4d_leaf_size(p4d); pudp = pud_offset_lockless(p4dp, p4d, addr); pud = READ_ONCE(*pudp); if (!pud_present(pud)) return 0; if (pud_leaf(pud)) return pud_leaf_size(pud); pmdp = pmd_offset_lockless(pudp, pud, addr); again: pmd = pmdp_get_lockless(pmdp); if (!pmd_present(pmd)) return 0; if (pmd_leaf(pmd)) return pmd_leaf_size(pmd); ptep = pte_offset_map(&pmd, addr); if (!ptep) goto again; pte = ptep_get_lockless(ptep); if (pte_present(pte)) size = __pte_leaf_size(pmd, pte); pte_unmap(ptep); #endif /* CONFIG_HAVE_GUP_FAST */ return size; } static u64 perf_get_page_size(unsigned long addr) { struct mm_struct *mm; unsigned long flags; u64 size; if (!addr) return 0; /* * Software page-table walkers must disable IRQs, * which prevents any tear down of the page tables. */ local_irq_save(flags); mm = current->mm; if (!mm) { /* * For kernel threads and the like, use init_mm so that * we can find kernel memory. */ mm = &init_mm; } size = perf_get_pgtable_size(mm, addr); local_irq_restore(flags); return size; } static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { bool kernel = !event->attr.exclude_callchain_kernel; bool user = !event->attr.exclude_callchain_user; /* Disallow cross-task user callchains. */ bool crosstask = event->ctx->task && event->ctx->task != current; const u32 max_stack = event->attr.sample_max_stack; struct perf_callchain_entry *callchain; if (!kernel && !user) return &__empty_callchain; callchain = get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true); return callchain ?: &__empty_callchain; } static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d) { return d * !!(flags & s); } void perf_prepare_sample(struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs) { u64 sample_type = event->attr.sample_type; u64 filtered_sample_type; /* * Add the sample flags that are dependent to others. And clear the * sample flags that have already been done by the PMU driver. 
*/ filtered_sample_type = sample_type; filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_CODE_PAGE_SIZE, PERF_SAMPLE_IP); filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_DATA_PAGE_SIZE | PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR); filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_STACK_USER, PERF_SAMPLE_REGS_USER); filtered_sample_type &= ~data->sample_flags; if (filtered_sample_type == 0) { /* Make sure it has the correct data->type for output */ data->type = event->attr.sample_type; return; } __perf_event_header__init_id(data, event, filtered_sample_type); if (filtered_sample_type & PERF_SAMPLE_IP) { data->ip = perf_instruction_pointer(event, regs); data->sample_flags |= PERF_SAMPLE_IP; } if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) perf_sample_save_callchain(data, event, regs); if (filtered_sample_type & PERF_SAMPLE_RAW) { data->raw = NULL; data->dyn_size += sizeof(u64); data->sample_flags |= PERF_SAMPLE_RAW; } if (filtered_sample_type & PERF_SAMPLE_BRANCH_STACK) { data->br_stack = NULL; data->dyn_size += sizeof(u64); data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; } if (filtered_sample_type & PERF_SAMPLE_REGS_USER) perf_sample_regs_user(&data->regs_user, regs); /* * It cannot use the filtered_sample_type here as REGS_USER can be set * by STACK_USER (using __cond_set() above) and we don't want to update * the dyn_size if it's not requested by users. */ if ((sample_type & ~data->sample_flags) & PERF_SAMPLE_REGS_USER) { /* regs dump ABI info */ int size = sizeof(u64); if (data->regs_user.regs) { u64 mask = event->attr.sample_regs_user; size += hweight64(mask) * sizeof(u64); } data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_REGS_USER; } if (filtered_sample_type & PERF_SAMPLE_STACK_USER) { /* * Either we need PERF_SAMPLE_STACK_USER bit to be always * processed as the last one or have additional check added * in case new sample type is added, because we could eat * up the rest of the sample size. */ u16 stack_size = event->attr.sample_stack_user; u16 header_size = perf_sample_data_size(data, event); u16 size = sizeof(u64); stack_size = perf_sample_ustack_size(stack_size, header_size, data->regs_user.regs); /* * If there is something to dump, add space for the dump * itself and for the field that tells the dynamic size, * which is how many have been actually dumped. 
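 *
 * Illustrative aside (not part of this file): the sizing above makes
 * PERF_SAMPLE_STACK_USER appear in the record as three parts: a static size,
 * the raw stack bytes, and a trailing dynamic size saying how much was
 * actually dumped.  Userspace requests it with, e.g.,
 * attr.sample_type |= PERF_SAMPLE_STACK_USER and
 * attr.sample_stack_user = 8192 (a multiple of 8), and can walk it like this
 * (helper name hypothetical, 'p' points at the field inside a sample):
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static uint64_t stack_bytes_dumped(const char *p)
 *	{
 *		uint64_t size, dyn_size;
 *
 *		memcpy(&size, p, sizeof(size));		// static size, 0 for kernel threads
 *		if (!size)
 *			return 0;
 *		memcpy(&dyn_size, p + sizeof(size) + size, sizeof(dyn_size));
 *		return dyn_size;			// bytes actually copied
 *	}
 *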
*/ if (stack_size) size += sizeof(u64) + stack_size; data->stack_user_size = stack_size; data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_STACK_USER; } if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) { data->weight.full = 0; data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) { data->data_src.val = PERF_MEM_NA; data->sample_flags |= PERF_SAMPLE_DATA_SRC; } if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) { data->txn = 0; data->sample_flags |= PERF_SAMPLE_TRANSACTION; } if (filtered_sample_type & PERF_SAMPLE_ADDR) { data->addr = 0; data->sample_flags |= PERF_SAMPLE_ADDR; } if (filtered_sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); perf_sample_regs_intr(&data->regs_intr, regs); if (data->regs_intr.regs) { u64 mask = event->attr.sample_regs_intr; size += hweight64(mask) * sizeof(u64); } data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_REGS_INTR; } if (filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) { data->phys_addr = perf_virt_to_phys(data->addr); data->sample_flags |= PERF_SAMPLE_PHYS_ADDR; } #ifdef CONFIG_CGROUP_PERF if (filtered_sample_type & PERF_SAMPLE_CGROUP) { struct cgroup *cgrp; /* protected by RCU */ cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup; data->cgroup = cgroup_id(cgrp); data->sample_flags |= PERF_SAMPLE_CGROUP; } #endif /* * PERF_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user doesn't * require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr, * but the value will not dump to the userspace. */ if (filtered_sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) { data->data_page_size = perf_get_page_size(data->addr); data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE; } if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) { data->code_page_size = perf_get_page_size(data->ip); data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE; } if (filtered_sample_type & PERF_SAMPLE_AUX) { u64 size; u16 header_size = perf_sample_data_size(data, event); header_size += sizeof(u64); /* size */ /* * Given the 16bit nature of header::size, an AUX sample can * easily overflow it, what with all the preceding sample bits. * Make sure this doesn't happen by using up to U16_MAX bytes * per sample in total (rounded down to 8 byte boundary). */ size = min_t(size_t, U16_MAX - header_size, event->attr.aux_sample_size); size = rounddown(size, 8); size = perf_prepare_sample_aux(event, data, size); WARN_ON_ONCE(size + header_size > U16_MAX); data->dyn_size += size + sizeof(u64); /* size above */ data->sample_flags |= PERF_SAMPLE_AUX; } } void perf_prepare_header(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs) { header->type = PERF_RECORD_SAMPLE; header->size = perf_sample_data_size(data, event); header->misc = perf_misc_flags(event, regs); /* * If you're adding more sample types here, you likely need to do * something about the overflowing header::size, like repurpose the * lowest 3 bits of size, which should be always zero at the moment. * This raises a more important question, do we really need 512k sized * samples and why, so good argumentation is in order for whatever you * do here next. 
*/ WARN_ON_ONCE(header->size & 7); } static void __perf_event_aux_pause(struct perf_event *event, bool pause) { if (pause) { if (!event->hw.aux_paused) { event->hw.aux_paused = 1; event->pmu->stop(event, PERF_EF_PAUSE); } } else { if (event->hw.aux_paused) { event->hw.aux_paused = 0; event->pmu->start(event, PERF_EF_RESUME); } } } static void perf_event_aux_pause(struct perf_event *event, bool pause) { struct perf_buffer *rb; if (WARN_ON_ONCE(!event)) return; rb = ring_buffer_get(event); if (!rb) return; scoped_guard (irqsave) { /* * Guard against self-recursion here. Another event could trip * this same from NMI context. */ if (READ_ONCE(rb->aux_in_pause_resume)) break; WRITE_ONCE(rb->aux_in_pause_resume, 1); barrier(); __perf_event_aux_pause(event, pause); barrier(); WRITE_ONCE(rb->aux_in_pause_resume, 0); } ring_buffer_put(rb); } static __always_inline int __perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs, int (*output_begin)(struct perf_output_handle *, struct perf_sample_data *, struct perf_event *, unsigned int)) { struct perf_output_handle handle; struct perf_event_header header; int err; /* protect the callchain buffers */ rcu_read_lock(); perf_prepare_sample(data, event, regs); perf_prepare_header(&header, data, event, regs); err = output_begin(&handle, data, event, header.size); if (err) goto exit; perf_output_sample(&handle, &header, data, event); perf_output_end(&handle); exit: rcu_read_unlock(); return err; } void perf_event_output_forward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { __perf_event_output(event, data, regs, perf_output_begin_forward); } void perf_event_output_backward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { __perf_event_output(event, data, regs, perf_output_begin_backward); } int perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { return __perf_event_output(event, data, regs, perf_output_begin); } /* * read event_id */ struct perf_read_event { struct perf_event_header header; u32 pid; u32 tid; }; static void perf_event_read_event(struct perf_event *event, struct task_struct *task) { struct perf_output_handle handle; struct perf_sample_data sample; struct perf_read_event read_event = { .header = { .type = PERF_RECORD_READ, .misc = 0, .size = sizeof(read_event) + event->read_size, }, .pid = perf_event_pid(event, task), .tid = perf_event_tid(event, task), }; int ret; perf_event_header__init_id(&read_event.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, read_event.header.size); if (ret) return; perf_output_put(&handle, read_event); perf_output_read(&handle, event); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } typedef void (perf_iterate_f)(struct perf_event *event, void *data); static void perf_iterate_ctx(struct perf_event_context *ctx, perf_iterate_f output, void *data, bool all) { struct perf_event *event; list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (!all) { if (event->state < PERF_EVENT_STATE_INACTIVE) continue; if (!event_filter_match(event)) continue; } output(event, data); } } static void perf_iterate_sb_cpu(perf_iterate_f output, void *data) { struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events); struct perf_event *event; list_for_each_entry_rcu(event, &pel->list, sb_list) { /* * Skip events that are not fully formed yet; ensure that * if we observe event->ctx, both event and ctx will be * complete 
enough. See perf_install_in_context(). */ if (!smp_load_acquire(&event->ctx)) continue; if (event->state < PERF_EVENT_STATE_INACTIVE) continue; if (!event_filter_match(event)) continue; output(event, data); } } /* * Iterate all events that need to receive side-band events. * * For new callers; ensure that account_pmu_sb_event() includes * your event, otherwise it might not get delivered. */ static void perf_iterate_sb(perf_iterate_f output, void *data, struct perf_event_context *task_ctx) { struct perf_event_context *ctx; rcu_read_lock(); preempt_disable(); /* * If we have task_ctx != NULL we only notify the task context itself. * The task_ctx is set only for EXIT events before releasing task * context. */ if (task_ctx) { perf_iterate_ctx(task_ctx, output, data, false); goto done; } perf_iterate_sb_cpu(output, data); ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_iterate_ctx(ctx, output, data, false); done: preempt_enable(); rcu_read_unlock(); } /* * Clear all file-based filters at exec, they'll have to be * re-instated when/if these objects are mmapped again. */ static void perf_event_addr_filters_exec(struct perf_event *event, void *data) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); struct perf_addr_filter *filter; unsigned int restart = 0, count = 0; unsigned long flags; if (!has_addr_filter(event)) return; raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { if (filter->path.dentry) { event->addr_filter_ranges[count].start = 0; event->addr_filter_ranges[count].size = 0; restart++; } count++; } if (restart) event->addr_filters_gen++; raw_spin_unlock_irqrestore(&ifh->lock, flags); if (restart) perf_event_stop(event, 1); } void perf_event_exec(void) { struct perf_event_context *ctx; ctx = perf_pin_task_context(current); if (!ctx) return; perf_event_enable_on_exec(ctx); perf_event_remove_on_exec(ctx); perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); perf_unpin_context(ctx); put_ctx(ctx); } struct remote_output { struct perf_buffer *rb; int err; }; static void __perf_event_output_stop(struct perf_event *event, void *data) { struct perf_event *parent = event->parent; struct remote_output *ro = data; struct perf_buffer *rb = ro->rb; struct stop_event_data sd = { .event = event, }; if (!has_aux(event)) return; if (!parent) parent = event; /* * In case of inheritance, it will be the parent that links to the * ring-buffer, but it will be the child that's actually using it. * * We are using event::rb to determine if the event should be stopped, * however this may race with ring_buffer_attach() (through set_output), * which will make us skip the event that actually needs to be stopped. * So ring_buffer_attach() has to stop an aux event before re-assigning * its rb pointer. 
*/ if (rcu_dereference(parent->rb) == rb) ro->err = __perf_event_stop(&sd); } static int __perf_pmu_output_stop(void *info) { struct perf_event *event = info; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct remote_output ro = { .rb = event->rb, }; rcu_read_lock(); perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); if (cpuctx->task_ctx) perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop, &ro, false); rcu_read_unlock(); return ro.err; } static void perf_pmu_output_stop(struct perf_event *event) { struct perf_event *iter; int err, cpu; restart: rcu_read_lock(); list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) { /* * For per-CPU events, we need to make sure that neither they * nor their children are running; for cpu==-1 events it's * sufficient to stop the event itself if it's active, since * it can't have children. */ cpu = iter->cpu; if (cpu == -1) cpu = READ_ONCE(iter->oncpu); if (cpu == -1) continue; err = cpu_function_call(cpu, __perf_pmu_output_stop, event); if (err == -EAGAIN) { rcu_read_unlock(); goto restart; } } rcu_read_unlock(); } /* * task tracking -- fork/exit * * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task */ struct perf_task_event { struct task_struct *task; struct perf_event_context *task_ctx; struct { struct perf_event_header header; u32 pid; u32 ppid; u32 tid; u32 ptid; u64 time; } event_id; }; static int perf_event_task_match(struct perf_event *event) { return event->attr.comm || event->attr.mmap || event->attr.mmap2 || event->attr.mmap_data || event->attr.task; } static void perf_event_task_output(struct perf_event *event, void *data) { struct perf_task_event *task_event = data; struct perf_output_handle handle; struct perf_sample_data sample; struct task_struct *task = task_event->task; int ret, size = task_event->event_id.header.size; if (!perf_event_task_match(event)) return; perf_event_header__init_id(&task_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, task_event->event_id.header.size); if (ret) goto out; task_event->event_id.pid = perf_event_pid(event, task); task_event->event_id.tid = perf_event_tid(event, task); if (task_event->event_id.header.type == PERF_RECORD_EXIT) { task_event->event_id.ppid = perf_event_pid(event, task->real_parent); task_event->event_id.ptid = perf_event_pid(event, task->real_parent); } else { /* PERF_RECORD_FORK */ task_event->event_id.ppid = perf_event_pid(event, current); task_event->event_id.ptid = perf_event_tid(event, current); } task_event->event_id.time = perf_event_clock(event); perf_output_put(&handle, task_event->event_id); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: task_event->event_id.header.size = size; } static void perf_event_task(struct task_struct *task, struct perf_event_context *task_ctx, int new) { struct perf_task_event task_event; if (!atomic_read(&nr_comm_events) && !atomic_read(&nr_mmap_events) && !atomic_read(&nr_task_events)) return; task_event = (struct perf_task_event){ .task = task, .task_ctx = task_ctx, .event_id = { .header = { .type = new ? 
PERF_RECORD_FORK : PERF_RECORD_EXIT, .misc = 0, .size = sizeof(task_event.event_id), }, /* .pid */ /* .ppid */ /* .tid */ /* .ptid */ /* .time */ }, }; perf_iterate_sb(perf_event_task_output, &task_event, task_ctx); } void perf_event_fork(struct task_struct *task) { perf_event_task(task, NULL, 1); perf_event_namespaces(task); } /* * comm tracking */ struct perf_comm_event { struct task_struct *task; char *comm; int comm_size; struct { struct perf_event_header header; u32 pid; u32 tid; } event_id; }; static int perf_event_comm_match(struct perf_event *event) { return event->attr.comm; } static void perf_event_comm_output(struct perf_event *event, void *data) { struct perf_comm_event *comm_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int size = comm_event->event_id.header.size; int ret; if (!perf_event_comm_match(event)) return; perf_event_header__init_id(&comm_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, comm_event->event_id.header.size); if (ret) goto out; comm_event->event_id.pid = perf_event_pid(event, comm_event->task); comm_event->event_id.tid = perf_event_tid(event, comm_event->task); perf_output_put(&handle, comm_event->event_id); __output_copy(&handle, comm_event->comm, comm_event->comm_size); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: comm_event->event_id.header.size = size; } static void perf_event_comm_event(struct perf_comm_event *comm_event) { char comm[TASK_COMM_LEN]; unsigned int size; memset(comm, 0, sizeof(comm)); strscpy(comm, comm_event->task->comm, sizeof(comm)); size = ALIGN(strlen(comm)+1, sizeof(u64)); comm_event->comm = comm; comm_event->comm_size = size; comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; perf_iterate_sb(perf_event_comm_output, comm_event, NULL); } void perf_event_comm(struct task_struct *task, bool exec) { struct perf_comm_event comm_event; if (!atomic_read(&nr_comm_events)) return; comm_event = (struct perf_comm_event){ .task = task, /* .comm */ /* .comm_size */ .event_id = { .header = { .type = PERF_RECORD_COMM, .misc = exec ? 
PERF_RECORD_MISC_COMM_EXEC : 0, /* .size */ }, /* .pid */ /* .tid */ }, }; perf_event_comm_event(&comm_event); } /* * namespaces tracking */ struct perf_namespaces_event { struct task_struct *task; struct { struct perf_event_header header; u32 pid; u32 tid; u64 nr_namespaces; struct perf_ns_link_info link_info[NR_NAMESPACES]; } event_id; }; static int perf_event_namespaces_match(struct perf_event *event) { return event->attr.namespaces; } static void perf_event_namespaces_output(struct perf_event *event, void *data) { struct perf_namespaces_event *namespaces_event = data; struct perf_output_handle handle; struct perf_sample_data sample; u16 header_size = namespaces_event->event_id.header.size; int ret; if (!perf_event_namespaces_match(event)) return; perf_event_header__init_id(&namespaces_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, namespaces_event->event_id.header.size); if (ret) goto out; namespaces_event->event_id.pid = perf_event_pid(event, namespaces_event->task); namespaces_event->event_id.tid = perf_event_tid(event, namespaces_event->task); perf_output_put(&handle, namespaces_event->event_id); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: namespaces_event->event_id.header.size = header_size; } static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, struct task_struct *task, const struct proc_ns_operations *ns_ops) { struct path ns_path; struct inode *ns_inode; int error; error = ns_get_path(&ns_path, task, ns_ops); if (!error) { ns_inode = ns_path.dentry->d_inode; ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev); ns_link_info->ino = ns_inode->i_ino; path_put(&ns_path); } } void perf_event_namespaces(struct task_struct *task) { struct perf_namespaces_event namespaces_event; struct perf_ns_link_info *ns_link_info; if (!atomic_read(&nr_namespaces_events)) return; namespaces_event = (struct perf_namespaces_event){ .task = task, .event_id = { .header = { .type = PERF_RECORD_NAMESPACES, .misc = 0, .size = sizeof(namespaces_event.event_id), }, /* .pid */ /* .tid */ .nr_namespaces = NR_NAMESPACES, /* .link_info[NR_NAMESPACES] */ }, }; ns_link_info = namespaces_event.event_id.link_info; perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX], task, &mntns_operations); #ifdef CONFIG_USER_NS perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX], task, &userns_operations); #endif #ifdef CONFIG_NET_NS perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX], task, &netns_operations); #endif #ifdef CONFIG_UTS_NS perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX], task, &utsns_operations); #endif #ifdef CONFIG_IPC_NS perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX], task, &ipcns_operations); #endif #ifdef CONFIG_PID_NS perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX], task, &pidns_operations); #endif #ifdef CONFIG_CGROUPS perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX], task, &cgroupns_operations); #endif perf_iterate_sb(perf_event_namespaces_output, &namespaces_event, NULL); } /* * cgroup tracking */ #ifdef CONFIG_CGROUP_PERF struct perf_cgroup_event { char *path; int path_size; struct { struct perf_event_header header; u64 id; char path[]; } event_id; }; static int perf_event_cgroup_match(struct perf_event *event) { return event->attr.cgroup; } static void perf_event_cgroup_output(struct perf_event *event, void *data) { struct perf_cgroup_event *cgroup_event = data; struct perf_output_handle handle; struct perf_sample_data sample; u16 header_size = cgroup_event->event_id.header.size; 
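/*
 * Illustrative userspace sketch, not part of this file: a minimal consumer of
 * the side-band records built above (perf_event_task(), perf_event_comm(),
 * perf_event_namespaces()). It opens a PERF_COUNT_SW_DUMMY event, sets the
 * corresponding attr bits and mmaps the ring buffer the records land in.
 * Buffer size and the privilege notes are assumptions, not requirements
 * stated here.
 */
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

static long sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
				int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	size_t len = (size_t)sysconf(_SC_PAGESIZE) * (1 + 8); /* meta page + 2^3 data pages */
	void *ring;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_DUMMY;	/* no counting, side-band records only */
	attr.comm = 1;				/* PERF_RECORD_COMM */
	attr.task = 1;				/* PERF_RECORD_FORK / PERF_RECORD_EXIT */
	attr.namespaces = 1;			/* PERF_RECORD_NAMESPACES (may need perfmon privileges) */
	/* attr.cgroup = 1 would also request PERF_RECORD_CGROUP (typically privileged) */
	attr.sample_id_all = 1;			/* append the id sample to every record */
	attr.disabled = 1;

	fd = (int)sys_perf_event_open(&attr, 0 /* this task */, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	ring = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (ring == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	/* fork()/exec() something here, then walk struct perf_event_header records */
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

	munmap(ring, len);
	close(fd);
	return 0;
}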
int ret; if (!perf_event_cgroup_match(event)) return; perf_event_header__init_id(&cgroup_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, cgroup_event->event_id.header.size); if (ret) goto out; perf_output_put(&handle, cgroup_event->event_id); __output_copy(&handle, cgroup_event->path, cgroup_event->path_size); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: cgroup_event->event_id.header.size = header_size; } static void perf_event_cgroup(struct cgroup *cgrp) { struct perf_cgroup_event cgroup_event; char path_enomem[16] = "//enomem"; char *pathname; size_t size; if (!atomic_read(&nr_cgroup_events)) return; cgroup_event = (struct perf_cgroup_event){ .event_id = { .header = { .type = PERF_RECORD_CGROUP, .misc = 0, .size = sizeof(cgroup_event.event_id), }, .id = cgroup_id(cgrp), }, }; pathname = kmalloc(PATH_MAX, GFP_KERNEL); if (pathname == NULL) { cgroup_event.path = path_enomem; } else { /* just to be sure to have enough space for alignment */ cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64)); cgroup_event.path = pathname; } /* * Since our buffer works in 8 byte units we need to align our string * size to a multiple of 8. However, we must guarantee the tail end is * zero'd out to avoid leaking random bits to userspace. */ size = strlen(cgroup_event.path) + 1; while (!IS_ALIGNED(size, sizeof(u64))) cgroup_event.path[size++] = '\0'; cgroup_event.event_id.header.size += size; cgroup_event.path_size = size; perf_iterate_sb(perf_event_cgroup_output, &cgroup_event, NULL); kfree(pathname); } #endif /* * mmap tracking */ struct perf_mmap_event { struct vm_area_struct *vma; const char *file_name; int file_size; int maj, min; u64 ino; u64 ino_generation; u32 prot, flags; u8 build_id[BUILD_ID_SIZE_MAX]; u32 build_id_size; struct { struct perf_event_header header; u32 pid; u32 tid; u64 start; u64 len; u64 pgoff; } event_id; }; static int perf_event_mmap_match(struct perf_event *event, void *data) { struct perf_mmap_event *mmap_event = data; struct vm_area_struct *vma = mmap_event->vma; int executable = vma->vm_flags & VM_EXEC; return (!executable && event->attr.mmap_data) || (executable && (event->attr.mmap || event->attr.mmap2)); } static void perf_event_mmap_output(struct perf_event *event, void *data) { struct perf_mmap_event *mmap_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int size = mmap_event->event_id.header.size; u32 type = mmap_event->event_id.header.type; bool use_build_id; int ret; if (!perf_event_mmap_match(event, data)) return; if (event->attr.mmap2) { mmap_event->event_id.header.type = PERF_RECORD_MMAP2; mmap_event->event_id.header.size += sizeof(mmap_event->maj); mmap_event->event_id.header.size += sizeof(mmap_event->min); mmap_event->event_id.header.size += sizeof(mmap_event->ino); mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation); mmap_event->event_id.header.size += sizeof(mmap_event->prot); mmap_event->event_id.header.size += sizeof(mmap_event->flags); } perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, mmap_event->event_id.header.size); if (ret) goto out; mmap_event->event_id.pid = perf_event_pid(event, current); mmap_event->event_id.tid = perf_event_tid(event, current); use_build_id = event->attr.build_id && mmap_event->build_id_size; if (event->attr.mmap2 && use_build_id) mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID; perf_output_put(&handle, 
mmap_event->event_id); if (event->attr.mmap2) { if (use_build_id) { u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 }; __output_copy(&handle, size, 4); __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX); } else { perf_output_put(&handle, mmap_event->maj); perf_output_put(&handle, mmap_event->min); perf_output_put(&handle, mmap_event->ino); perf_output_put(&handle, mmap_event->ino_generation); } perf_output_put(&handle, mmap_event->prot); perf_output_put(&handle, mmap_event->flags); } __output_copy(&handle, mmap_event->file_name, mmap_event->file_size); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: mmap_event->event_id.header.size = size; mmap_event->event_id.header.type = type; } static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) { struct vm_area_struct *vma = mmap_event->vma; struct file *file = vma->vm_file; int maj = 0, min = 0; u64 ino = 0, gen = 0; u32 prot = 0, flags = 0; unsigned int size; char tmp[16]; char *buf = NULL; char *name = NULL; if (vma->vm_flags & VM_READ) prot |= PROT_READ; if (vma->vm_flags & VM_WRITE) prot |= PROT_WRITE; if (vma->vm_flags & VM_EXEC) prot |= PROT_EXEC; if (vma->vm_flags & VM_MAYSHARE) flags = MAP_SHARED; else flags = MAP_PRIVATE; if (vma->vm_flags & VM_LOCKED) flags |= MAP_LOCKED; if (is_vm_hugetlb_page(vma)) flags |= MAP_HUGETLB; if (file) { struct inode *inode; dev_t dev; buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) { name = "//enomem"; goto cpy_name; } /* * d_path() works from the end of the rb backwards, so we * need to add enough zero bytes after the string to handle * the 64bit alignment we do later. */ name = file_path(file, buf, PATH_MAX - sizeof(u64)); if (IS_ERR(name)) { name = "//toolong"; goto cpy_name; } inode = file_inode(vma->vm_file); dev = inode->i_sb->s_dev; ino = inode->i_ino; gen = inode->i_generation; maj = MAJOR(dev); min = MINOR(dev); goto got_name; } else { if (vma->vm_ops && vma->vm_ops->name) name = (char *) vma->vm_ops->name(vma); if (!name) name = (char *)arch_vma_name(vma); if (!name) { if (vma_is_initial_heap(vma)) name = "[heap]"; else if (vma_is_initial_stack(vma)) name = "[stack]"; else name = "//anon"; } } cpy_name: strscpy(tmp, name, sizeof(tmp)); name = tmp; got_name: /* * Since our buffer works in 8 byte units we need to align our string * size to a multiple of 8. However, we must guarantee the tail end is * zero'd out to avoid leaking random bits to userspace. */ size = strlen(name)+1; while (!IS_ALIGNED(size, sizeof(u64))) name[size++] = '\0'; mmap_event->file_name = name; mmap_event->file_size = size; mmap_event->maj = maj; mmap_event->min = min; mmap_event->ino = ino; mmap_event->ino_generation = gen; mmap_event->prot = prot; mmap_event->flags = flags; if (!(vma->vm_flags & VM_EXEC)) mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; if (atomic_read(&nr_build_id_events)) build_id_parse_nofault(vma, mmap_event->build_id, &mmap_event->build_id_size); perf_iterate_sb(perf_event_mmap_output, mmap_event, NULL); kfree(buf); } /* * Check whether inode and address range match filter criteria. 
*/ static bool perf_addr_filter_match(struct perf_addr_filter *filter, struct file *file, unsigned long offset, unsigned long size) { /* d_inode(NULL) won't be equal to any mapped user-space file */ if (!filter->path.dentry) return false; if (d_inode(filter->path.dentry) != file_inode(file)) return false; if (filter->offset > offset + size) return false; if (filter->offset + filter->size < offset) return false; return true; } static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter, struct vm_area_struct *vma, struct perf_addr_filter_range *fr) { unsigned long vma_size = vma->vm_end - vma->vm_start; unsigned long off = vma->vm_pgoff << PAGE_SHIFT; struct file *file = vma->vm_file; if (!perf_addr_filter_match(filter, file, off, vma_size)) return false; if (filter->offset < off) { fr->start = vma->vm_start; fr->size = min(vma_size, filter->size - (off - filter->offset)); } else { fr->start = vma->vm_start + filter->offset - off; fr->size = min(vma->vm_end - fr->start, filter->size); } return true; } static void __perf_addr_filters_adjust(struct perf_event *event, void *data) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); struct vm_area_struct *vma = data; struct perf_addr_filter *filter; unsigned int restart = 0, count = 0; unsigned long flags; if (!has_addr_filter(event)) return; if (!vma->vm_file) return; raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { if (perf_addr_filter_vma_adjust(filter, vma, &event->addr_filter_ranges[count])) restart++; count++; } if (restart) event->addr_filters_gen++; raw_spin_unlock_irqrestore(&ifh->lock, flags); if (restart) perf_event_stop(event, 1); } /* * Adjust all task's events' filters to the new vma */ static void perf_addr_filters_adjust(struct vm_area_struct *vma) { struct perf_event_context *ctx; /* * Data tracing isn't supported yet and as such there is no need * to keep track of anything that isn't related to executable code: */ if (!(vma->vm_flags & VM_EXEC)) return; rcu_read_lock(); ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true); rcu_read_unlock(); } void perf_event_mmap(struct vm_area_struct *vma) { struct perf_mmap_event mmap_event; if (!atomic_read(&nr_mmap_events)) return; mmap_event = (struct perf_mmap_event){ .vma = vma, /* .file_name */ /* .file_size */ .event_id = { .header = { .type = PERF_RECORD_MMAP, .misc = PERF_RECORD_MISC_USER, /* .size */ }, /* .pid */ /* .tid */ .start = vma->vm_start, .len = vma->vm_end - vma->vm_start, .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, }, /* .maj (attr_mmap2 only) */ /* .min (attr_mmap2 only) */ /* .ino (attr_mmap2 only) */ /* .ino_generation (attr_mmap2 only) */ /* .prot (attr_mmap2 only) */ /* .flags (attr_mmap2 only) */ }; perf_addr_filters_adjust(vma); perf_event_mmap_event(&mmap_event); } void perf_event_aux_event(struct perf_event *event, unsigned long head, unsigned long size, u64 flags) { struct perf_output_handle handle; struct perf_sample_data sample; struct perf_aux_event { struct perf_event_header header; u64 offset; u64 size; u64 flags; } rec = { .header = { .type = PERF_RECORD_AUX, .misc = 0, .size = sizeof(rec), }, .offset = head, .size = size, .flags = flags, }; int ret; perf_event_header__init_id(&rec.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; perf_output_put(&handle, rec); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } /* * Lost/dropped 
samples logging */ void perf_log_lost_samples(struct perf_event *event, u64 lost) { struct perf_output_handle handle; struct perf_sample_data sample; int ret; struct { struct perf_event_header header; u64 lost; } lost_samples_event = { .header = { .type = PERF_RECORD_LOST_SAMPLES, .misc = 0, .size = sizeof(lost_samples_event), }, .lost = lost, }; perf_event_header__init_id(&lost_samples_event.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, lost_samples_event.header.size); if (ret) return; perf_output_put(&handle, lost_samples_event); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } /* * context_switch tracking */ struct perf_switch_event { struct task_struct *task; struct task_struct *next_prev; struct { struct perf_event_header header; u32 next_prev_pid; u32 next_prev_tid; } event_id; }; static int perf_event_switch_match(struct perf_event *event) { return event->attr.context_switch; } static void perf_event_switch_output(struct perf_event *event, void *data) { struct perf_switch_event *se = data; struct perf_output_handle handle; struct perf_sample_data sample; int ret; if (!perf_event_switch_match(event)) return; /* Only CPU-wide events are allowed to see next/prev pid/tid */ if (event->ctx->task) { se->event_id.header.type = PERF_RECORD_SWITCH; se->event_id.header.size = sizeof(se->event_id.header); } else { se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE; se->event_id.header.size = sizeof(se->event_id); se->event_id.next_prev_pid = perf_event_pid(event, se->next_prev); se->event_id.next_prev_tid = perf_event_tid(event, se->next_prev); } perf_event_header__init_id(&se->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size); if (ret) return; if (event->ctx->task) perf_output_put(&handle, se->event_id.header); else perf_output_put(&handle, se->event_id); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } static void perf_event_switch(struct task_struct *task, struct task_struct *next_prev, bool sched_in) { struct perf_switch_event switch_event; /* N.B. caller checks nr_switch_events != 0 */ switch_event = (struct perf_switch_event){ .task = task, .next_prev = next_prev, .event_id = { .header = { /* .type */ .misc = sched_in ? 
0 : PERF_RECORD_MISC_SWITCH_OUT, /* .size */ }, /* .next_prev_pid */ /* .next_prev_tid */ }, }; if (!sched_in && task_is_runnable(task)) { switch_event.event_id.header.misc |= PERF_RECORD_MISC_SWITCH_OUT_PREEMPT; } perf_iterate_sb(perf_event_switch_output, &switch_event, NULL); } /* * IRQ throttle logging */ static void perf_log_throttle(struct perf_event *event, int enable) { struct perf_output_handle handle; struct perf_sample_data sample; int ret; struct { struct perf_event_header header; u64 time; u64 id; u64 stream_id; } throttle_event = { .header = { .type = PERF_RECORD_THROTTLE, .misc = 0, .size = sizeof(throttle_event), }, .time = perf_event_clock(event), .id = primary_event_id(event), .stream_id = event->id, }; if (enable) throttle_event.header.type = PERF_RECORD_UNTHROTTLE; perf_event_header__init_id(&throttle_event.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, throttle_event.header.size); if (ret) return; perf_output_put(&handle, throttle_event); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } /* * ksymbol register/unregister tracking */ struct perf_ksymbol_event { const char *name; int name_len; struct { struct perf_event_header header; u64 addr; u32 len; u16 ksym_type; u16 flags; } event_id; }; static int perf_event_ksymbol_match(struct perf_event *event) { return event->attr.ksymbol; } static void perf_event_ksymbol_output(struct perf_event *event, void *data) { struct perf_ksymbol_event *ksymbol_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int ret; if (!perf_event_ksymbol_match(event)) return; perf_event_header__init_id(&ksymbol_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, ksymbol_event->event_id.header.size); if (ret) return; perf_output_put(&handle, ksymbol_event->event_id); __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym) { struct perf_ksymbol_event ksymbol_event; char name[KSYM_NAME_LEN]; u16 flags = 0; int name_len; if (!atomic_read(&nr_ksymbol_events)) return; if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX || ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) goto err; strscpy(name, sym, KSYM_NAME_LEN); name_len = strlen(name) + 1; while (!IS_ALIGNED(name_len, sizeof(u64))) name[name_len++] = '\0'; BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64)); if (unregister) flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER; ksymbol_event = (struct perf_ksymbol_event){ .name = name, .name_len = name_len, .event_id = { .header = { .type = PERF_RECORD_KSYMBOL, .size = sizeof(ksymbol_event.event_id) + name_len, }, .addr = addr, .len = len, .ksym_type = ksym_type, .flags = flags, }, }; perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL); return; err: WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type); } /* * bpf program load/unload tracking */ struct perf_bpf_event { struct bpf_prog *prog; struct { struct perf_event_header header; u16 type; u16 flags; u32 id; u8 tag[BPF_TAG_SIZE]; } event_id; }; static int perf_event_bpf_match(struct perf_event *event) { return event->attr.bpf_event; } static void perf_event_bpf_output(struct perf_event *event, void *data) { struct perf_bpf_event *bpf_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int ret; if (!perf_event_bpf_match(event)) return; 
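/*
 * Illustrative sketch, not part of this file: the u64 padding applied by
 * perf_event_ksymbol(), perf_event_comm_event() and perf_event_cgroup()
 * above, modelled as a standalone userspace helper. The string length is
 * rounded up to a multiple of 8 and the tail is explicitly zeroed so no
 * stale bytes leak to userspace. Helper and buffer names are made up;
 * buf_size is assumed to be a multiple of 8.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static size_t pad_name_to_u64(char *buf, size_t buf_size, const char *name)
{
	size_t size;

	strncpy(buf, name, buf_size - 1);
	buf[buf_size - 1] = '\0';

	/* strlen()+1 rounded up to 8, zero-filling like the kernel loop does */
	size = strlen(buf) + 1;
	while (size % sizeof(uint64_t))
		buf[size++] = '\0';

	return size;	/* the value added to the record's header.size */
}

int main(void)
{
	char buf[64];

	/* "my_module:func" is 14 chars, so 15 bytes with NUL, padded to 16 */
	printf("%zu\n", pad_name_to_u64(buf, sizeof(buf), "my_module:func"));
	return 0;
}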
perf_event_header__init_id(&bpf_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, bpf_event->event_id.header.size); if (ret) return; perf_output_put(&handle, bpf_event->event_id); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog, enum perf_bpf_event_type type) { bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD; int i; perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, (u64)(unsigned long)prog->bpf_func, prog->jited_len, unregister, prog->aux->ksym.name); for (i = 1; i < prog->aux->func_cnt; i++) { struct bpf_prog *subprog = prog->aux->func[i]; perf_event_ksymbol( PERF_RECORD_KSYMBOL_TYPE_BPF, (u64)(unsigned long)subprog->bpf_func, subprog->jited_len, unregister, subprog->aux->ksym.name); } } void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags) { struct perf_bpf_event bpf_event; switch (type) { case PERF_BPF_EVENT_PROG_LOAD: case PERF_BPF_EVENT_PROG_UNLOAD: if (atomic_read(&nr_ksymbol_events)) perf_event_bpf_emit_ksymbols(prog, type); break; default: return; } if (!atomic_read(&nr_bpf_events)) return; bpf_event = (struct perf_bpf_event){ .prog = prog, .event_id = { .header = { .type = PERF_RECORD_BPF_EVENT, .size = sizeof(bpf_event.event_id), }, .type = type, .flags = flags, .id = prog->aux->id, }, }; BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64)); memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE); perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); } struct perf_text_poke_event { const void *old_bytes; const void *new_bytes; size_t pad; u16 old_len; u16 new_len; struct { struct perf_event_header header; u64 addr; } event_id; }; static int perf_event_text_poke_match(struct perf_event *event) { return event->attr.text_poke; } static void perf_event_text_poke_output(struct perf_event *event, void *data) { struct perf_text_poke_event *text_poke_event = data; struct perf_output_handle handle; struct perf_sample_data sample; u64 padding = 0; int ret; if (!perf_event_text_poke_match(event)) return; perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, text_poke_event->event_id.header.size); if (ret) return; perf_output_put(&handle, text_poke_event->event_id); perf_output_put(&handle, text_poke_event->old_len); perf_output_put(&handle, text_poke_event->new_len); __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len); __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len); if (text_poke_event->pad) __output_copy(&handle, &padding, text_poke_event->pad); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } void perf_event_text_poke(const void *addr, const void *old_bytes, size_t old_len, const void *new_bytes, size_t new_len) { struct perf_text_poke_event text_poke_event; size_t tot, pad; if (!atomic_read(&nr_text_poke_events)) return; tot = sizeof(text_poke_event.old_len) + old_len; tot += sizeof(text_poke_event.new_len) + new_len; pad = ALIGN(tot, sizeof(u64)) - tot; text_poke_event = (struct perf_text_poke_event){ .old_bytes = old_bytes, .new_bytes = new_bytes, .pad = pad, .old_len = old_len, .new_len = new_len, .event_id = { .header = { .type = PERF_RECORD_TEXT_POKE, .misc = PERF_RECORD_MISC_KERNEL, .size = sizeof(text_poke_event.event_id) + tot + pad, }, .addr = (unsigned long)addr, }, }; perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL); } void 
perf_event_itrace_started(struct perf_event *event) { event->attach_state |= PERF_ATTACH_ITRACE; } static void perf_log_itrace_start(struct perf_event *event) { struct perf_output_handle handle; struct perf_sample_data sample; struct perf_aux_event { struct perf_event_header header; u32 pid; u32 tid; } rec; int ret; if (event->parent) event = event->parent; if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) || event->attach_state & PERF_ATTACH_ITRACE) return; rec.header.type = PERF_RECORD_ITRACE_START; rec.header.misc = 0; rec.header.size = sizeof(rec); rec.pid = perf_event_pid(event, current); rec.tid = perf_event_tid(event, current); perf_event_header__init_id(&rec.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; perf_output_put(&handle, rec); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } void perf_report_aux_output_id(struct perf_event *event, u64 hw_id) { struct perf_output_handle handle; struct perf_sample_data sample; struct perf_aux_event { struct perf_event_header header; u64 hw_id; } rec; int ret; if (event->parent) event = event->parent; rec.header.type = PERF_RECORD_AUX_OUTPUT_HW_ID; rec.header.misc = 0; rec.header.size = sizeof(rec); rec.hw_id = hw_id; perf_event_header__init_id(&rec.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; perf_output_put(&handle, rec); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } EXPORT_SYMBOL_GPL(perf_report_aux_output_id); static int __perf_event_account_interrupt(struct perf_event *event, int throttle) { struct hw_perf_event *hwc = &event->hw; int ret = 0; u64 seq; seq = __this_cpu_read(perf_throttled_seq); if (seq != hwc->interrupts_seq) { hwc->interrupts_seq = seq; hwc->interrupts = 1; } else { hwc->interrupts++; if (unlikely(throttle && hwc->interrupts > max_samples_per_tick)) { __this_cpu_inc(perf_throttled_count); tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); hwc->interrupts = MAX_INTERRUPTS; perf_log_throttle(event, 0); ret = 1; } } if (event->attr.freq) { u64 now = perf_clock(); s64 delta = now - hwc->freq_time_stamp; hwc->freq_time_stamp = now; if (delta > 0 && delta < 2*TICK_NSEC) perf_adjust_period(event, delta, hwc->last_period, true); } return ret; } int perf_event_account_interrupt(struct perf_event *event) { return __perf_event_account_interrupt(event, 1); } static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs) { /* * Due to interrupt latency (AKA "skid"), we may enter the * kernel before taking an overflow, even if the PMU is only * counting user events. 
*/ if (event->attr.exclude_kernel && !user_mode(regs)) return false; return true; } #ifdef CONFIG_BPF_SYSCALL static int bpf_overflow_handler(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { struct bpf_perf_event_data_kern ctx = { .data = data, .event = event, }; struct bpf_prog *prog; int ret = 0; ctx.regs = perf_arch_bpf_user_pt_regs(regs); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) goto out; rcu_read_lock(); prog = READ_ONCE(event->prog); if (prog) { perf_prepare_sample(data, event, regs); ret = bpf_prog_run(prog, &ctx); } rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); return ret; } static inline int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { if (event->overflow_handler_context) /* hw breakpoint or kernel counter */ return -EINVAL; if (event->prog) return -EEXIST; if (prog->type != BPF_PROG_TYPE_PERF_EVENT) return -EINVAL; if (event->attr.precise_ip && prog->call_get_stack && (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) || event->attr.exclude_callchain_kernel || event->attr.exclude_callchain_user)) { /* * On perf_event with precise_ip, calling bpf_get_stack() * may trigger unwinder warnings and occasional crashes. * bpf_get_[stack|stackid] works around this issue by using * callchain attached to perf_sample_data. If the * perf_event does not full (kernel and user) callchain * attached to perf_sample_data, do not allow attaching BPF * program that calls bpf_get_[stack|stackid]. */ return -EPROTO; } event->prog = prog; event->bpf_cookie = bpf_cookie; return 0; } static inline void perf_event_free_bpf_handler(struct perf_event *event) { struct bpf_prog *prog = event->prog; if (!prog) return; event->prog = NULL; bpf_prog_put(prog); } #else static inline int bpf_overflow_handler(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { return 1; } static inline int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { return -EOPNOTSUPP; } static inline void perf_event_free_bpf_handler(struct perf_event *event) { } #endif /* * Generic event overflow handling, sampling. */ static int __perf_event_overflow(struct perf_event *event, int throttle, struct perf_sample_data *data, struct pt_regs *regs) { int events = atomic_read(&event->event_limit); int ret = 0; /* * Non-sampling counters might still use the PMI to fold short * hardware counters, ignore those. */ if (unlikely(!is_sampling_event(event))) return 0; ret = __perf_event_account_interrupt(event, throttle); if (event->attr.aux_pause) perf_event_aux_pause(event->aux_event, true); if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT && !bpf_overflow_handler(event, data, regs)) goto out; /* * XXX event_limit might not quite work as expected on inherited * events */ event->pending_kill = POLL_IN; if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; perf_event_disable_inatomic(event); } if (event->attr.sigtrap) { /* * The desired behaviour of sigtrap vs invalid samples is a bit * tricky; on the one hand, one should not loose the SIGTRAP if * it is the first event, on the other hand, we should also not * trigger the WARN or override the data address. */ bool valid_sample = sample_is_allowed(event, regs); unsigned int pending_id = 1; enum task_work_notify_mode notify_mode; if (regs) pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1; notify_mode = in_nmi() ? 
TWA_NMI_CURRENT : TWA_RESUME; if (!event->pending_work && !task_work_add(current, &event->pending_task, notify_mode)) { event->pending_work = pending_id; local_inc(&event->ctx->nr_no_switch_fast); event->pending_addr = 0; if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) event->pending_addr = data->addr; } else if (event->attr.exclude_kernel && valid_sample) { /* * Should not be able to return to user space without * consuming pending_work; with exceptions: * * 1. Where !exclude_kernel, events can overflow again * in the kernel without returning to user space. * * 2. Events that can overflow again before the IRQ- * work without user space progress (e.g. hrtimer). * To approximate progress (with false negatives), * check 32-bit hash of the current IP. */ WARN_ON_ONCE(event->pending_work != pending_id); } } READ_ONCE(event->overflow_handler)(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; irq_work_queue(&event->pending_irq); } out: if (event->attr.aux_resume) perf_event_aux_pause(event->aux_event, false); return ret; } int perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { return __perf_event_overflow(event, 1, data, regs); } /* * Generic software event infrastructure */ struct swevent_htable { struct swevent_hlist *swevent_hlist; struct mutex hlist_mutex; int hlist_refcount; }; static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); /* * We directly increment event->count and keep a second value in * event->hw.period_left to count intervals. This period event * is kept in the range [-sample_period, 0] so that we can use the * sign as trigger. */ u64 perf_swevent_set_period(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; u64 period = hwc->last_period; u64 nr, offset; s64 old, val; hwc->last_period = hwc->sample_period; old = local64_read(&hwc->period_left); do { val = old; if (val < 0) return 0; nr = div64_u64(period + val, period); offset = nr * period; val -= offset; } while (!local64_try_cmpxchg(&hwc->period_left, &old, val)); return nr; } static void perf_swevent_overflow(struct perf_event *event, u64 overflow, struct perf_sample_data *data, struct pt_regs *regs) { struct hw_perf_event *hwc = &event->hw; int throttle = 0; if (!overflow) overflow = perf_swevent_set_period(event); if (hwc->interrupts == MAX_INTERRUPTS) return; for (; overflow; overflow--) { if (__perf_event_overflow(event, throttle, data, regs)) { /* * We inhibit the overflow from happening when * hwc->interrupts == MAX_INTERRUPTS. 
*/ break; } throttle = 1; } } static void perf_swevent_event(struct perf_event *event, u64 nr, struct perf_sample_data *data, struct pt_regs *regs) { struct hw_perf_event *hwc = &event->hw; local64_add(nr, &event->count); if (!regs) return; if (!is_sampling_event(event)) return; if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) { data->period = nr; return perf_swevent_overflow(event, 1, data, regs); } else data->period = event->hw.last_period; if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) return perf_swevent_overflow(event, 1, data, regs); if (local64_add_negative(nr, &hwc->period_left)) return; perf_swevent_overflow(event, 0, data, regs); } static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) return 1; if (regs) { if (event->attr.exclude_user && user_mode(regs)) return 1; if (event->attr.exclude_kernel && !user_mode(regs)) return 1; } return 0; } static int perf_swevent_match(struct perf_event *event, enum perf_type_id type, u32 event_id, struct perf_sample_data *data, struct pt_regs *regs) { if (event->attr.type != type) return 0; if (event->attr.config != event_id) return 0; if (perf_exclude_event(event, regs)) return 0; return 1; } static inline u64 swevent_hash(u64 type, u32 event_id) { u64 val = event_id | (type << 32); return hash_64(val, SWEVENT_HLIST_BITS); } static inline struct hlist_head * __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) { u64 hash = swevent_hash(type, event_id); return &hlist->heads[hash]; } /* For the read side: events when they trigger */ static inline struct hlist_head * find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) { struct swevent_hlist *hlist; hlist = rcu_dereference(swhash->swevent_hlist); if (!hlist) return NULL; return __find_swevent_head(hlist, type, event_id); } /* For the event head insertion and removal in the hlist */ static inline struct hlist_head * find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) { struct swevent_hlist *hlist; u32 event_id = event->attr.config; u64 type = event->attr.type; /* * Event scheduling is always serialized against hlist allocation * and release. Which makes the protected version suitable here. * The context lock guarantees that. 
*/ hlist = rcu_dereference_protected(swhash->swevent_hlist, lockdep_is_held(&event->ctx->lock)); if (!hlist) return NULL; return __find_swevent_head(hlist, type, event_id); } static void do_perf_sw_event(enum perf_type_id type, u32 event_id, u64 nr, struct perf_sample_data *data, struct pt_regs *regs) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); struct perf_event *event; struct hlist_head *head; rcu_read_lock(); head = find_swevent_head_rcu(swhash, type, event_id); if (!head) goto end; hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_swevent_match(event, type, event_id, data, regs)) perf_swevent_event(event, nr, data, regs); } end: rcu_read_unlock(); } DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]); int perf_swevent_get_recursion_context(void) { return get_recursion_context(current->perf_recursion); } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); void perf_swevent_put_recursion_context(int rctx) { put_recursion_context(current->perf_recursion, rctx); } void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { struct perf_sample_data data; if (WARN_ON_ONCE(!regs)) return; perf_sample_data_init(&data, addr, 0); do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); } void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { int rctx; preempt_disable_notrace(); rctx = perf_swevent_get_recursion_context(); if (unlikely(rctx < 0)) goto fail; ___perf_sw_event(event_id, nr, regs, addr); perf_swevent_put_recursion_context(rctx); fail: preempt_enable_notrace(); } static void perf_swevent_read(struct perf_event *event) { } static int perf_swevent_add(struct perf_event *event, int flags) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); struct hw_perf_event *hwc = &event->hw; struct hlist_head *head; if (is_sampling_event(event)) { hwc->last_period = hwc->sample_period; perf_swevent_set_period(event); } hwc->state = !(flags & PERF_EF_START); head = find_swevent_head(swhash, event); if (WARN_ON_ONCE(!head)) return -EINVAL; hlist_add_head_rcu(&event->hlist_entry, head); perf_event_update_userpage(event); return 0; } static void perf_swevent_del(struct perf_event *event, int flags) { hlist_del_rcu(&event->hlist_entry); } static void perf_swevent_start(struct perf_event *event, int flags) { event->hw.state = 0; } static void perf_swevent_stop(struct perf_event *event, int flags) { event->hw.state = PERF_HES_STOPPED; } /* Deref the hlist from the update side */ static inline struct swevent_hlist * swevent_hlist_deref(struct swevent_htable *swhash) { return rcu_dereference_protected(swhash->swevent_hlist, lockdep_is_held(&swhash->hlist_mutex)); } static void swevent_hlist_release(struct swevent_htable *swhash) { struct swevent_hlist *hlist = swevent_hlist_deref(swhash); if (!hlist) return; RCU_INIT_POINTER(swhash->swevent_hlist, NULL); kfree_rcu(hlist, rcu_head); } static void swevent_hlist_put_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); if (!--swhash->hlist_refcount) swevent_hlist_release(swhash); mutex_unlock(&swhash->hlist_mutex); } static void swevent_hlist_put(void) { int cpu; for_each_possible_cpu(cpu) swevent_hlist_put_cpu(cpu); } static int swevent_hlist_get_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); int err = 0; mutex_lock(&swhash->hlist_mutex); if (!swevent_hlist_deref(swhash) && cpumask_test_cpu(cpu, perf_online_mask)) { struct swevent_hlist *hlist; hlist = kzalloc(sizeof(*hlist), 
GFP_KERNEL); if (!hlist) { err = -ENOMEM; goto exit; } rcu_assign_pointer(swhash->swevent_hlist, hlist); } swhash->hlist_refcount++; exit: mutex_unlock(&swhash->hlist_mutex); return err; } static int swevent_hlist_get(void) { int err, cpu, failed_cpu; mutex_lock(&pmus_lock); for_each_possible_cpu(cpu) { err = swevent_hlist_get_cpu(cpu); if (err) { failed_cpu = cpu; goto fail; } } mutex_unlock(&pmus_lock); return 0; fail: for_each_possible_cpu(cpu) { if (cpu == failed_cpu) break; swevent_hlist_put_cpu(cpu); } mutex_unlock(&pmus_lock); return err; } struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; static void sw_perf_event_destroy(struct perf_event *event) { u64 event_id = event->attr.config; WARN_ON(event->parent); static_key_slow_dec(&perf_swevent_enabled[event_id]); swevent_hlist_put(); } static struct pmu perf_cpu_clock; /* fwd declaration */ static struct pmu perf_task_clock; static int perf_swevent_init(struct perf_event *event) { u64 event_id = event->attr.config; if (event->attr.type != PERF_TYPE_SOFTWARE) return -ENOENT; /* * no branch sampling for software events */ if (has_branch_stack(event)) return -EOPNOTSUPP; switch (event_id) { case PERF_COUNT_SW_CPU_CLOCK: event->attr.type = perf_cpu_clock.type; return -ENOENT; case PERF_COUNT_SW_TASK_CLOCK: event->attr.type = perf_task_clock.type; return -ENOENT; default: break; } if (event_id >= PERF_COUNT_SW_MAX) return -ENOENT; if (!event->parent) { int err; err = swevent_hlist_get(); if (err) return err; static_key_slow_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; } return 0; } static struct pmu perf_swevent = { .task_ctx_nr = perf_sw_context, .capabilities = PERF_PMU_CAP_NO_NMI, .event_init = perf_swevent_init, .add = perf_swevent_add, .del = perf_swevent_del, .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, }; #ifdef CONFIG_EVENT_TRACING static void tp_perf_event_destroy(struct perf_event *event) { perf_trace_destroy(event); } static int perf_tp_event_init(struct perf_event *event) { int err; if (event->attr.type != PERF_TYPE_TRACEPOINT) return -ENOENT; /* * no branch sampling for tracepoint events */ if (has_branch_stack(event)) return -EOPNOTSUPP; err = perf_trace_init(event); if (err) return err; event->destroy = tp_perf_event_destroy; return 0; } static struct pmu perf_tracepoint = { .task_ctx_nr = perf_sw_context, .event_init = perf_tp_event_init, .add = perf_trace_add, .del = perf_trace_del, .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, }; static int perf_tp_filter_match(struct perf_event *event, struct perf_sample_data *data) { void *record = data->raw->frag.data; /* only top level events have filters set */ if (event->parent) event = event->parent; if (likely(!event->filter) || filter_match_preds(event->filter, record)) return 1; return 0; } static int perf_tp_event_match(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) return 0; /* * If exclude_kernel, only trace user-space tracepoints (uprobes) */ if (event->attr.exclude_kernel && !user_mode(regs)) return 0; if (!perf_tp_filter_match(event, data)) return 0; return 1; } void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, struct trace_event_call *call, u64 count, struct pt_regs *regs, struct hlist_head *head, struct task_struct *task) { if (bpf_prog_array_valid(call)) { *(struct pt_regs **)raw_data = regs; if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) { 
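/*
 * Illustrative userspace sketch, not part of this file: opening an event on
 * the tracepoint PMU whose init path (perf_tp_event_init()) appears above.
 * The config value is the numeric id exported by tracefs; the path below and
 * the helper name are example values only (the tracefs mount point may differ).
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static long open_tracepoint_event(const char *id_path)
{
	struct perf_event_attr attr;
	char buf[32];
	FILE *f;

	f = fopen(id_path, "r");	/* e.g. .../events/sched/sched_switch/id */
	if (!f)
		return -1;
	if (!fgets(buf, sizeof(buf), f)) {
		fclose(f);
		return -1;
	}
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_TRACEPOINT;	/* routed to perf_tp_event_init() */
	attr.config = strtoull(buf, NULL, 10);	/* tracepoint id */
	attr.sample_period = 1;			/* fire on every hit */
	attr.sample_type = PERF_SAMPLE_RAW;	/* raw tracepoint payload */

	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}

int main(void)
{
	long fd = open_tracepoint_event("/sys/kernel/tracing/events/sched/sched_switch/id");

	if (fd < 0) {
		perror("tracepoint event");
		return 1;
	}
	close((int)fd);
	return 0;
}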
perf_swevent_put_recursion_context(rctx); return; } } perf_tp_event(call->event.type, count, raw_data, size, regs, head, rctx, task); } EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); static void __perf_tp_event_target_task(u64 count, void *record, struct pt_regs *regs, struct perf_sample_data *data, struct perf_event *event) { struct trace_entry *entry = record; if (event->attr.config != entry->type) return; /* Cannot deliver synchronous signal to other task. */ if (event->attr.sigtrap) return; if (perf_tp_event_match(event, data, regs)) perf_swevent_event(event, count, data, regs); } static void perf_tp_event_target_task(u64 count, void *record, struct pt_regs *regs, struct perf_sample_data *data, struct perf_event_context *ctx) { unsigned int cpu = smp_processor_id(); struct pmu *pmu = &perf_tracepoint; struct perf_event *event, *sibling; perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) { __perf_tp_event_target_task(count, record, regs, data, event); for_each_sibling_event(sibling, event) __perf_tp_event_target_task(count, record, regs, data, sibling); } perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) { __perf_tp_event_target_task(count, record, regs, data, event); for_each_sibling_event(sibling, event) __perf_tp_event_target_task(count, record, regs, data, sibling); } } void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, struct task_struct *task) { struct perf_sample_data data; struct perf_event *event; struct perf_raw_record raw = { .frag = { .size = entry_size, .data = record, }, }; perf_sample_data_init(&data, 0, 0); perf_sample_save_raw_data(&data, &raw); perf_trace_buf_update(record, event_type); hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_tp_event_match(event, &data, regs)) { perf_swevent_event(event, count, &data, regs); /* * Here use the same on-stack perf_sample_data, * some members in data are event-specific and * need to be re-computed for different sweveents. * Re-initialize data->sample_flags safely to avoid * the problem that next event skips preparing data * because data->sample_flags is set. */ perf_sample_data_init(&data, 0, 0); perf_sample_save_raw_data(&data, &raw); } } /* * If we got specified a target task, also iterate its context and * deliver this event there too. */ if (task && task != current) { struct perf_event_context *ctx; rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp); if (!ctx) goto unlock; raw_spin_lock(&ctx->lock); perf_tp_event_target_task(count, record, regs, &data, ctx); raw_spin_unlock(&ctx->lock); unlock: rcu_read_unlock(); } perf_swevent_put_recursion_context(rctx); } EXPORT_SYMBOL_GPL(perf_tp_event); #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) /* * Flags in config, used by dynamic PMU kprobe and uprobe * The flags should match following PMU_FORMAT_ATTR(). * * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe * if not set, create kprobe/uprobe * * The following values specify a reference counter (or semaphore in the * terminology of tools like dtrace, systemtap, etc.) Userspace Statically * Defined Tracepoints (USDT). Currently, we use 40 bit for the offset. 
* PERF_UPROBE_REF_CTR_OFFSET_BITS # of bits in config as the offset * PERF_UPROBE_REF_CTR_OFFSET_SHIFT # of bits to shift left */ enum perf_probe_config { PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */ PERF_UPROBE_REF_CTR_OFFSET_BITS = 32, PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS, }; PMU_FORMAT_ATTR(retprobe, "config:0"); #endif #ifdef CONFIG_KPROBE_EVENTS static struct attribute *kprobe_attrs[] = { &format_attr_retprobe.attr, NULL, }; static struct attribute_group kprobe_format_group = { .name = "format", .attrs = kprobe_attrs, }; static const struct attribute_group *kprobe_attr_groups[] = { &kprobe_format_group, NULL, }; static int perf_kprobe_event_init(struct perf_event *event); static struct pmu perf_kprobe = { .task_ctx_nr = perf_sw_context, .event_init = perf_kprobe_event_init, .add = perf_trace_add, .del = perf_trace_del, .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, .attr_groups = kprobe_attr_groups, }; static int perf_kprobe_event_init(struct perf_event *event) { int err; bool is_retprobe; if (event->attr.type != perf_kprobe.type) return -ENOENT; if (!perfmon_capable()) return -EACCES; /* * no branch sampling for probe events */ if (has_branch_stack(event)) return -EOPNOTSUPP; is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; err = perf_kprobe_init(event, is_retprobe); if (err) return err; event->destroy = perf_kprobe_destroy; return 0; } #endif /* CONFIG_KPROBE_EVENTS */ #ifdef CONFIG_UPROBE_EVENTS PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63"); static struct attribute *uprobe_attrs[] = { &format_attr_retprobe.attr, &format_attr_ref_ctr_offset.attr, NULL, }; static struct attribute_group uprobe_format_group = { .name = "format", .attrs = uprobe_attrs, }; static const struct attribute_group *uprobe_attr_groups[] = { &uprobe_format_group, NULL, }; static int perf_uprobe_event_init(struct perf_event *event); static struct pmu perf_uprobe = { .task_ctx_nr = perf_sw_context, .event_init = perf_uprobe_event_init, .add = perf_trace_add, .del = perf_trace_del, .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, .attr_groups = uprobe_attr_groups, }; static int perf_uprobe_event_init(struct perf_event *event) { int err; unsigned long ref_ctr_offset; bool is_retprobe; if (event->attr.type != perf_uprobe.type) return -ENOENT; if (!perfmon_capable()) return -EACCES; /* * no branch sampling for probe events */ if (has_branch_stack(event)) return -EOPNOTSUPP; is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT; err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe); if (err) return err; event->destroy = perf_uprobe_destroy; return 0; } #endif /* CONFIG_UPROBE_EVENTS */ static inline void perf_tp_register(void) { perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); #ifdef CONFIG_KPROBE_EVENTS perf_pmu_register(&perf_kprobe, "kprobe", -1); #endif #ifdef CONFIG_UPROBE_EVENTS perf_pmu_register(&perf_uprobe, "uprobe", -1); #endif } static void perf_event_free_filter(struct perf_event *event) { ftrace_profile_free_filter(event); } /* * returns true if the event is a tracepoint, or a kprobe/uprobe created * with perf_event_open() */ static inline bool perf_event_is_tracing(struct perf_event *event) { if (event->pmu == &perf_tracepoint) return true; #ifdef CONFIG_KPROBE_EVENTS if (event->pmu == &perf_kprobe) return true; #endif #ifdef CONFIG_UPROBE_EVENTS if
(event->pmu == &perf_uprobe) return true; #endif return false; } int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp; if (!perf_event_is_tracing(event)) return perf_event_set_bpf_handler(event, prog, bpf_cookie); is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE; is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; is_syscall_tp = is_syscall_trace_event(event->tp_event); if (!is_kprobe && !is_uprobe && !is_tracepoint && !is_syscall_tp) /* bpf programs can only be attached to u/kprobe or tracepoint */ return -EINVAL; if (((is_kprobe || is_uprobe) && prog->type != BPF_PROG_TYPE_KPROBE) || (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) || (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) return -EINVAL; if (prog->type == BPF_PROG_TYPE_KPROBE && prog->sleepable && !is_uprobe) /* only uprobe programs are allowed to be sleepable */ return -EINVAL; /* Kprobe override only works for kprobes, not uprobes. */ if (prog->kprobe_override && !is_kprobe) return -EINVAL; if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); if (prog->aux->max_ctx_offset > off) return -EACCES; } return perf_event_attach_bpf_prog(event, prog, bpf_cookie); } void perf_event_free_bpf_prog(struct perf_event *event) { if (!perf_event_is_tracing(event)) { perf_event_free_bpf_handler(event); return; } perf_event_detach_bpf_prog(event); } #else static inline void perf_tp_register(void) { } static void perf_event_free_filter(struct perf_event *event) { } int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { return -ENOENT; } void perf_event_free_bpf_prog(struct perf_event *event) { } #endif /* CONFIG_EVENT_TRACING */ #ifdef CONFIG_HAVE_HW_BREAKPOINT void perf_bp_event(struct perf_event *bp, void *data) { struct perf_sample_data sample; struct pt_regs *regs = data; perf_sample_data_init(&sample, bp->attr.bp_addr, 0); if (!bp->hw.state && !perf_exclude_event(bp, regs)) perf_swevent_event(bp, 1, &sample, regs); } #endif /* * Allocate a new address filter */ static struct perf_addr_filter * perf_addr_filter_new(struct perf_event *event, struct list_head *filters) { int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu); struct perf_addr_filter *filter; filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node); if (!filter) return NULL; INIT_LIST_HEAD(&filter->entry); list_add_tail(&filter->entry, filters); return filter; } static void free_filters_list(struct list_head *filters) { struct perf_addr_filter *filter, *iter; list_for_each_entry_safe(filter, iter, filters, entry) { path_put(&filter->path); list_del(&filter->entry); kfree(filter); } } /* * Free existing address filters and optionally install new ones */ static void perf_addr_filters_splice(struct perf_event *event, struct list_head *head) { unsigned long flags; LIST_HEAD(list); if (!has_addr_filter(event)) return; /* don't bother with children, they don't have their own filters */ if (event->parent) return; raw_spin_lock_irqsave(&event->addr_filters.lock, flags); list_splice_init(&event->addr_filters.list, &list); if (head) list_splice(head, &event->addr_filters.list); raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags); free_filters_list(&list); } /* * Scan through mm's vmas and see if one of them matches the * @filter; if so, adjust filter's address range. 
* Called with mm::mmap_lock down for reading. */ static void perf_addr_filter_apply(struct perf_addr_filter *filter, struct mm_struct *mm, struct perf_addr_filter_range *fr) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) { if (!vma->vm_file) continue; if (perf_addr_filter_vma_adjust(filter, vma, fr)) return; } } /* * Update event's address range filters based on the * task's existing mappings, if any. */ static void perf_event_addr_filters_apply(struct perf_event *event) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); struct task_struct *task = READ_ONCE(event->ctx->task); struct perf_addr_filter *filter; struct mm_struct *mm = NULL; unsigned int count = 0; unsigned long flags; /* * We may observe TASK_TOMBSTONE, which means that the event tear-down * will stop on the parent's child_mutex that our caller is also holding */ if (task == TASK_TOMBSTONE) return; if (ifh->nr_file_filters) { mm = get_task_mm(task); if (!mm) goto restart; mmap_read_lock(mm); } raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { if (filter->path.dentry) { /* * Adjust base offset if the filter is associated to a * binary that needs to be mapped: */ event->addr_filter_ranges[count].start = 0; event->addr_filter_ranges[count].size = 0; perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]); } else { event->addr_filter_ranges[count].start = filter->offset; event->addr_filter_ranges[count].size = filter->size; } count++; } event->addr_filters_gen++; raw_spin_unlock_irqrestore(&ifh->lock, flags); if (ifh->nr_file_filters) { mmap_read_unlock(mm); mmput(mm); } restart: perf_event_stop(event, 1); } /* * Address range filtering: limiting the data to certain * instruction address ranges. Filters are ioctl()ed to us from * userspace as ascii strings. * * Filter string format: * * ACTION RANGE_SPEC * where ACTION is one of the * * "filter": limit the trace to this region * * "start": start tracing from this address * * "stop": stop tracing at this address/region; * RANGE_SPEC is * * for kernel addresses: <start address>[/<size>] * * for object files: <start address>[/<size>]@</path/to/object/file> * * if <size> is not specified or is zero, the range is treated as a single * address; not valid for ACTION=="filter". 
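* * Examples (illustrative only; the object path below is hypothetical): * "filter 0x1000/0x2000@/usr/lib/libfoo.so" limit tracing to that file range * "start 0xffffffff81000000/0x100" start tracing in this kernel range * "stop 0xffffffff81000100" stop tracing at this kernel address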
*/ enum { IF_ACT_NONE = -1, IF_ACT_FILTER, IF_ACT_START, IF_ACT_STOP, IF_SRC_FILE, IF_SRC_KERNEL, IF_SRC_FILEADDR, IF_SRC_KERNELADDR, }; enum { IF_STATE_ACTION = 0, IF_STATE_SOURCE, IF_STATE_END, }; static const match_table_t if_tokens = { { IF_ACT_FILTER, "filter" }, { IF_ACT_START, "start" }, { IF_ACT_STOP, "stop" }, { IF_SRC_FILE, "%u/%u@%s" }, { IF_SRC_KERNEL, "%u/%u" }, { IF_SRC_FILEADDR, "%u@%s" }, { IF_SRC_KERNELADDR, "%u" }, { IF_ACT_NONE, NULL }, }; /* * Address filter string parser */ static int perf_event_parse_addr_filter(struct perf_event *event, char *fstr, struct list_head *filters) { struct perf_addr_filter *filter = NULL; char *start, *orig, *filename = NULL; substring_t args[MAX_OPT_ARGS]; int state = IF_STATE_ACTION, token; unsigned int kernel = 0; int ret = -EINVAL; orig = fstr = kstrdup(fstr, GFP_KERNEL); if (!fstr) return -ENOMEM; while ((start = strsep(&fstr, " ,\n")) != NULL) { static const enum perf_addr_filter_action_t actions[] = { [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER, [IF_ACT_START] = PERF_ADDR_FILTER_ACTION_START, [IF_ACT_STOP] = PERF_ADDR_FILTER_ACTION_STOP, }; ret = -EINVAL; if (!*start) continue; /* filter definition begins */ if (state == IF_STATE_ACTION) { filter = perf_addr_filter_new(event, filters); if (!filter) goto fail; } token = match_token(start, if_tokens, args); switch (token) { case IF_ACT_FILTER: case IF_ACT_START: case IF_ACT_STOP: if (state != IF_STATE_ACTION) goto fail; filter->action = actions[token]; state = IF_STATE_SOURCE; break; case IF_SRC_KERNELADDR: case IF_SRC_KERNEL: kernel = 1; fallthrough; case IF_SRC_FILEADDR: case IF_SRC_FILE: if (state != IF_STATE_SOURCE) goto fail; *args[0].to = 0; ret = kstrtoul(args[0].from, 0, &filter->offset); if (ret) goto fail; if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) { *args[1].to = 0; ret = kstrtoul(args[1].from, 0, &filter->size); if (ret) goto fail; } if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { int fpos = token == IF_SRC_FILE ? 2 : 1; kfree(filename); filename = match_strdup(&args[fpos]); if (!filename) { ret = -ENOMEM; goto fail; } } state = IF_STATE_END; break; default: goto fail; } /* * Filter definition is fully parsed, validate and install it. * Make sure that it doesn't contradict itself or the event's * attribute. */ if (state == IF_STATE_END) { ret = -EINVAL; /* * ACTION "filter" must have a non-zero length region * specified. */ if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER && !filter->size) goto fail; if (!kernel) { if (!filename) goto fail; /* * For now, we only support file-based filters * in per-task events; doing so for CPU-wide * events requires additional context switching * trickery, since same object code will be * mapped at different virtual addresses in * different processes. 
*/ ret = -EOPNOTSUPP; if (!event->ctx->task) goto fail; /* look up the path and grab its inode */ ret = kern_path(filename, LOOKUP_FOLLOW, &filter->path); if (ret) goto fail; ret = -EINVAL; if (!filter->path.dentry || !S_ISREG(d_inode(filter->path.dentry) ->i_mode)) goto fail; event->addr_filters.nr_file_filters++; } /* ready to consume more filters */ kfree(filename); filename = NULL; state = IF_STATE_ACTION; filter = NULL; kernel = 0; } } if (state != IF_STATE_ACTION) goto fail; kfree(filename); kfree(orig); return 0; fail: kfree(filename); free_filters_list(filters); kfree(orig); return ret; } static int perf_event_set_addr_filter(struct perf_event *event, char *filter_str) { LIST_HEAD(filters); int ret; /* * Since this is called in perf_ioctl() path, we're already holding * ctx::mutex. */ lockdep_assert_held(&event->ctx->mutex); if (WARN_ON_ONCE(event->parent)) return -EINVAL; ret = perf_event_parse_addr_filter(event, filter_str, &filters); if (ret) goto fail_clear_files; ret = event->pmu->addr_filters_validate(&filters); if (ret) goto fail_free_filters; /* remove existing filters, if any */ perf_addr_filters_splice(event, &filters); /* install new filters */ perf_event_for_each_child(event, perf_event_addr_filters_apply); return ret; fail_free_filters: free_filters_list(&filters); fail_clear_files: event->addr_filters.nr_file_filters = 0; return ret; } static int perf_event_set_filter(struct perf_event *event, void __user *arg) { int ret = -EINVAL; char *filter_str; filter_str = strndup_user(arg, PAGE_SIZE); if (IS_ERR(filter_str)) return PTR_ERR(filter_str); #ifdef CONFIG_EVENT_TRACING if (perf_event_is_tracing(event)) { struct perf_event_context *ctx = event->ctx; /* * Beware, here be dragons!! * * the tracepoint muck will deadlock against ctx->mutex, but * the tracepoint stuff does not actually need it. So * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we * already have a reference on ctx. * * This can result in event getting moved to a different ctx, * but that does not affect the tracepoint state. 
*/ mutex_unlock(&ctx->mutex); ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); mutex_lock(&ctx->mutex); } else #endif if (has_addr_filter(event)) ret = perf_event_set_addr_filter(event, filter_str); kfree(filter_str); return ret; } /* * hrtimer based swevent callback */ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) { enum hrtimer_restart ret = HRTIMER_RESTART; struct perf_sample_data data; struct pt_regs *regs; struct perf_event *event; u64 period; event = container_of(hrtimer, struct perf_event, hw.hrtimer); if (event->state != PERF_EVENT_STATE_ACTIVE) return HRTIMER_NORESTART; event->pmu->read(event); perf_sample_data_init(&data, 0, event->hw.last_period); regs = get_irq_regs(); if (regs && !perf_exclude_event(event, regs)) { if (!(event->attr.exclude_idle && is_idle_task(current))) if (__perf_event_overflow(event, 1, &data, regs)) ret = HRTIMER_NORESTART; } period = max_t(u64, 10000, event->hw.sample_period); hrtimer_forward_now(hrtimer, ns_to_ktime(period)); return ret; } static void perf_swevent_start_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; s64 period; if (!is_sampling_event(event)) return; period = local64_read(&hwc->period_left); if (period) { if (period < 0) period = 10000; local64_set(&hwc->period_left, 0); } else { period = max_t(u64, 10000, hwc->sample_period); } hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), HRTIMER_MODE_REL_PINNED_HARD); } static void perf_swevent_cancel_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; if (is_sampling_event(event)) { ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); local64_set(&hwc->period_left, ktime_to_ns(remaining)); hrtimer_cancel(&hwc->hrtimer); } } static void perf_swevent_init_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; if (!is_sampling_event(event)) return; hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hwc->hrtimer.function = perf_swevent_hrtimer; /* * Since hrtimers have a fixed rate, we can do a static freq->period * mapping and avoid the whole period adjust feedback stuff. 
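* * For example (illustrative): a requested sample_freq of 1000 Hz becomes a * fixed sample_period of NSEC_PER_SEC / 1000 = 1000000 ns, set once below.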
*/ if (event->attr.freq) { long freq = event->attr.sample_freq; event->attr.sample_period = NSEC_PER_SEC / freq; hwc->sample_period = event->attr.sample_period; local64_set(&hwc->period_left, hwc->sample_period); hwc->last_period = hwc->sample_period; event->attr.freq = 0; } } /* * Software event: cpu wall time clock */ static void cpu_clock_event_update(struct perf_event *event) { s64 prev; u64 now; now = local_clock(); prev = local64_xchg(&event->hw.prev_count, now); local64_add(now - prev, &event->count); } static void cpu_clock_event_start(struct perf_event *event, int flags) { local64_set(&event->hw.prev_count, local_clock()); perf_swevent_start_hrtimer(event); } static void cpu_clock_event_stop(struct perf_event *event, int flags) { perf_swevent_cancel_hrtimer(event); cpu_clock_event_update(event); } static int cpu_clock_event_add(struct perf_event *event, int flags) { if (flags & PERF_EF_START) cpu_clock_event_start(event, flags); perf_event_update_userpage(event); return 0; } static void cpu_clock_event_del(struct perf_event *event, int flags) { cpu_clock_event_stop(event, flags); } static void cpu_clock_event_read(struct perf_event *event) { cpu_clock_event_update(event); } static int cpu_clock_event_init(struct perf_event *event) { if (event->attr.type != perf_cpu_clock.type) return -ENOENT; if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) return -ENOENT; /* * no branch sampling for software events */ if (has_branch_stack(event)) return -EOPNOTSUPP; perf_swevent_init_hrtimer(event); return 0; } static struct pmu perf_cpu_clock = { .task_ctx_nr = perf_sw_context, .capabilities = PERF_PMU_CAP_NO_NMI, .dev = PMU_NULL_DEV, .event_init = cpu_clock_event_init, .add = cpu_clock_event_add, .del = cpu_clock_event_del, .start = cpu_clock_event_start, .stop = cpu_clock_event_stop, .read = cpu_clock_event_read, }; /* * Software event: task time clock */ static void task_clock_event_update(struct perf_event *event, u64 now) { u64 prev; s64 delta; prev = local64_xchg(&event->hw.prev_count, now); delta = now - prev; local64_add(delta, &event->count); } static void task_clock_event_start(struct perf_event *event, int flags) { local64_set(&event->hw.prev_count, event->ctx->time); perf_swevent_start_hrtimer(event); } static void task_clock_event_stop(struct perf_event *event, int flags) { perf_swevent_cancel_hrtimer(event); task_clock_event_update(event, event->ctx->time); } static int task_clock_event_add(struct perf_event *event, int flags) { if (flags & PERF_EF_START) task_clock_event_start(event, flags); perf_event_update_userpage(event); return 0; } static void task_clock_event_del(struct perf_event *event, int flags) { task_clock_event_stop(event, PERF_EF_UPDATE); } static void task_clock_event_read(struct perf_event *event) { u64 now = perf_clock(); u64 delta = now - event->ctx->timestamp; u64 time = event->ctx->time + delta; task_clock_event_update(event, time); } static int task_clock_event_init(struct perf_event *event) { if (event->attr.type != perf_task_clock.type) return -ENOENT; if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) return -ENOENT; /* * no branch sampling for software events */ if (has_branch_stack(event)) return -EOPNOTSUPP; perf_swevent_init_hrtimer(event); return 0; } static struct pmu perf_task_clock = { .task_ctx_nr = perf_sw_context, .capabilities = PERF_PMU_CAP_NO_NMI, .dev = PMU_NULL_DEV, .event_init = task_clock_event_init, .add = task_clock_event_add, .del = task_clock_event_del, .start = task_clock_event_start, .stop = task_clock_event_stop, .read = 
task_clock_event_read, }; static void perf_pmu_nop_void(struct pmu *pmu) { } static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags) { } static int perf_pmu_nop_int(struct pmu *pmu) { return 0; } static int perf_event_nop_int(struct perf_event *event, u64 value) { return 0; } static DEFINE_PER_CPU(unsigned int, nop_txn_flags); static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags) { __this_cpu_write(nop_txn_flags, flags); if (flags & ~PERF_PMU_TXN_ADD) return; perf_pmu_disable(pmu); } static int perf_pmu_commit_txn(struct pmu *pmu) { unsigned int flags = __this_cpu_read(nop_txn_flags); __this_cpu_write(nop_txn_flags, 0); if (flags & ~PERF_PMU_TXN_ADD) return 0; perf_pmu_enable(pmu); return 0; } static void perf_pmu_cancel_txn(struct pmu *pmu) { unsigned int flags = __this_cpu_read(nop_txn_flags); __this_cpu_write(nop_txn_flags, 0); if (flags & ~PERF_PMU_TXN_ADD) return; perf_pmu_enable(pmu); } static int perf_event_idx_default(struct perf_event *event) { return 0; } static void free_pmu_context(struct pmu *pmu) { free_percpu(pmu->cpu_pmu_context); } /* * Let userspace know that this PMU supports address range filtering: */ static ssize_t nr_addr_filters_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); } DEVICE_ATTR_RO(nr_addr_filters); static struct idr pmu_idr; static ssize_t type_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->type); } static DEVICE_ATTR_RO(type); static ssize_t perf_event_mux_interval_ms_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->hrtimer_interval_ms); } static DEFINE_MUTEX(mux_interval_mutex); static ssize_t perf_event_mux_interval_ms_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct pmu *pmu = dev_get_drvdata(dev); int timer, cpu, ret; ret = kstrtoint(buf, 0, &timer); if (ret) return ret; if (timer < 1) return -EINVAL; /* same value, nothing to do */ if (timer == pmu->hrtimer_interval_ms) return count; mutex_lock(&mux_interval_mutex); pmu->hrtimer_interval_ms = timer; /* update all cpuctx for this PMU */ cpus_read_lock(); for_each_online_cpu(cpu) { struct perf_cpu_pmu_context *cpc; cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu); cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc); } cpus_read_unlock(); mutex_unlock(&mux_interval_mutex); return count; } static DEVICE_ATTR_RW(perf_event_mux_interval_ms); static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu) { switch (scope) { case PERF_PMU_SCOPE_CORE: return topology_sibling_cpumask(cpu); case PERF_PMU_SCOPE_DIE: return topology_die_cpumask(cpu); case PERF_PMU_SCOPE_CLUSTER: return topology_cluster_cpumask(cpu); case PERF_PMU_SCOPE_PKG: return topology_core_cpumask(cpu); case PERF_PMU_SCOPE_SYS_WIDE: return cpu_online_mask; } return NULL; } static inline struct cpumask *perf_scope_cpumask(unsigned int scope) { switch (scope) { case PERF_PMU_SCOPE_CORE: return perf_online_core_mask; case PERF_PMU_SCOPE_DIE: return perf_online_die_mask; case PERF_PMU_SCOPE_CLUSTER: return perf_online_cluster_mask; case PERF_PMU_SCOPE_PKG: return perf_online_pkg_mask; case PERF_PMU_SCOPE_SYS_WIDE: return
perf_online_sys_mask; } return NULL; } static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) { struct pmu *pmu = dev_get_drvdata(dev); struct cpumask *mask = perf_scope_cpumask(pmu->scope); if (mask) return cpumap_print_to_pagebuf(true, buf, mask); return 0; } static DEVICE_ATTR_RO(cpumask); static struct attribute *pmu_dev_attrs[] = { &dev_attr_type.attr, &dev_attr_perf_event_mux_interval_ms.attr, &dev_attr_nr_addr_filters.attr, &dev_attr_cpumask.attr, NULL, }; static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = kobj_to_dev(kobj); struct pmu *pmu = dev_get_drvdata(dev); if (n == 2 && !pmu->nr_addr_filters) return 0; /* cpumask */ if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE) return 0; return a->mode; } static struct attribute_group pmu_dev_attr_group = { .is_visible = pmu_dev_is_visible, .attrs = pmu_dev_attrs, }; static const struct attribute_group *pmu_dev_groups[] = { &pmu_dev_attr_group, NULL, }; static int pmu_bus_running; static struct bus_type pmu_bus = { .name = "event_source", .dev_groups = pmu_dev_groups, }; static void pmu_dev_release(struct device *dev) { kfree(dev); } static int pmu_dev_alloc(struct pmu *pmu) { int ret = -ENOMEM; pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL); if (!pmu->dev) goto out; pmu->dev->groups = pmu->attr_groups; device_initialize(pmu->dev); dev_set_drvdata(pmu->dev, pmu); pmu->dev->bus = &pmu_bus; pmu->dev->parent = pmu->parent; pmu->dev->release = pmu_dev_release; ret = dev_set_name(pmu->dev, "%s", pmu->name); if (ret) goto free_dev; ret = device_add(pmu->dev); if (ret) goto free_dev; if (pmu->attr_update) { ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update); if (ret) goto del_dev; } out: return ret; del_dev: device_del(pmu->dev); free_dev: put_device(pmu->dev); goto out; } static struct lock_class_key cpuctx_mutex; static struct lock_class_key cpuctx_lock; int perf_pmu_register(struct pmu *pmu, const char *name, int type) { int cpu, ret, max = PERF_TYPE_MAX; mutex_lock(&pmus_lock); ret = -ENOMEM; pmu->pmu_disable_count = alloc_percpu(int); if (!pmu->pmu_disable_count) goto unlock; pmu->type = -1; if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) { ret = -EINVAL; goto free_pdc; } if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) { ret = -EINVAL; goto free_pdc; } pmu->name = name; if (type >= 0) max = type; ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL); if (ret < 0) goto free_pdc; WARN_ON(type >= 0 && ret != type); type = ret; pmu->type = type; if (pmu_bus_running && !pmu->dev) { ret = pmu_dev_alloc(pmu); if (ret) goto free_idr; } ret = -ENOMEM; pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context); if (!pmu->cpu_pmu_context) goto free_dev; for_each_possible_cpu(cpu) { struct perf_cpu_pmu_context *cpc; cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu); __perf_init_event_pmu_context(&cpc->epc, pmu); __perf_mux_hrtimer_init(cpc, cpu); } if (!pmu->start_txn) { if (pmu->pmu_enable) { /* * If we have pmu_enable/pmu_disable calls, install * transaction stubs that use that to try and batch * hardware accesses. 
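* (The stubs below simply bracket a PERF_PMU_TXN_ADD sequence with * perf_pmu_disable()/perf_pmu_enable(), so a group of pmu->add() calls * reaches the hardware as one batch; illustrative summary.)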
*/ pmu->start_txn = perf_pmu_start_txn; pmu->commit_txn = perf_pmu_commit_txn; pmu->cancel_txn = perf_pmu_cancel_txn; } else { pmu->start_txn = perf_pmu_nop_txn; pmu->commit_txn = perf_pmu_nop_int; pmu->cancel_txn = perf_pmu_nop_void; } } if (!pmu->pmu_enable) { pmu->pmu_enable = perf_pmu_nop_void; pmu->pmu_disable = perf_pmu_nop_void; } if (!pmu->check_period) pmu->check_period = perf_event_nop_int; if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; list_add_rcu(&pmu->entry, &pmus); atomic_set(&pmu->exclusive_cnt, 0); ret = 0; unlock: mutex_unlock(&pmus_lock); return ret; free_dev: if (pmu->dev && pmu->dev != PMU_NULL_DEV) { device_del(pmu->dev); put_device(pmu->dev); } free_idr: idr_remove(&pmu_idr, pmu->type); free_pdc: free_percpu(pmu->pmu_disable_count); goto unlock; } EXPORT_SYMBOL_GPL(perf_pmu_register); void perf_pmu_unregister(struct pmu *pmu) { mutex_lock(&pmus_lock); list_del_rcu(&pmu->entry); /* * We dereference the pmu list under both SRCU and regular RCU, so * synchronize against both of those. */ synchronize_srcu(&pmus_srcu); synchronize_rcu(); free_percpu(pmu->pmu_disable_count); idr_remove(&pmu_idr, pmu->type); if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) { if (pmu->nr_addr_filters) device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); device_del(pmu->dev); put_device(pmu->dev); } free_pmu_context(pmu); mutex_unlock(&pmus_lock); } EXPORT_SYMBOL_GPL(perf_pmu_unregister); static inline bool has_extended_regs(struct perf_event *event) { return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) || (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK); } static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) { struct perf_event_context *ctx = NULL; int ret; if (!try_module_get(pmu->module)) return -ENODEV; /* * A number of pmu->event_init() methods iterate the sibling_list to, * for example, validate if the group fits on the PMU. Therefore, * if this is a sibling event, acquire the ctx->mutex to protect * the sibling_list. */ if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) { /* * This ctx->mutex can nest when we're called through * inheritance. See the perf_event_ctx_lock_nested() comment. */ ctx = perf_event_ctx_lock_nested(event->group_leader, SINGLE_DEPTH_NESTING); BUG_ON(!ctx); } event->pmu = pmu; ret = pmu->event_init(event); if (ctx) perf_event_ctx_unlock(event->group_leader, ctx); if (!ret) { if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) && has_extended_regs(event)) ret = -EOPNOTSUPP; if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && event_has_any_exclude_flag(event)) ret = -EINVAL; if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) { const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu); struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope); int cpu; if (pmu_cpumask && cpumask) { cpu = cpumask_any_and(pmu_cpumask, cpumask); if (cpu >= nr_cpu_ids) ret = -ENODEV; else event->event_caps |= PERF_EV_CAP_READ_SCOPE; } else { ret = -ENODEV; } } if (ret && event->destroy) event->destroy(event); } if (ret) module_put(pmu->module); return ret; } static struct pmu *perf_init_event(struct perf_event *event) { bool extended_type = false; int idx, type, ret; struct pmu *pmu; idx = srcu_read_lock(&pmus_srcu); /* * Save original type before calling pmu->event_init() since certain * pmus overwrites event->attr.type to forward event to another pmu. 
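* (For instance, on heterogeneous/hybrid systems a PERF_TYPE_HARDWARE * request may be forwarded to a core-specific PMU; keeping orig_type * preserves what the caller originally asked for. Illustrative note, * not from this file.) */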
*/ event->orig_type = event->attr.type; /* Try parent's PMU first: */ if (event->parent && event->parent->pmu) { pmu = event->parent->pmu; ret = perf_try_init_event(pmu, event); if (!ret) goto unlock; } /* * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE * are often aliases for PERF_TYPE_RAW. */ type = event->attr.type; if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) { type = event->attr.config >> PERF_PMU_TYPE_SHIFT; if (!type) { type = PERF_TYPE_RAW; } else { extended_type = true; event->attr.config &= PERF_HW_EVENT_MASK; } } again: rcu_read_lock(); pmu = idr_find(&pmu_idr, type); rcu_read_unlock(); if (pmu) { if (event->attr.type != type && type != PERF_TYPE_RAW && !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE)) goto fail; ret = perf_try_init_event(pmu, event); if (ret == -ENOENT && event->attr.type != type && !extended_type) { type = event->attr.type; goto again; } if (ret) pmu = ERR_PTR(ret); goto unlock; } list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { ret = perf_try_init_event(pmu, event); if (!ret) goto unlock; if (ret != -ENOENT) { pmu = ERR_PTR(ret); goto unlock; } } fail: pmu = ERR_PTR(-ENOENT); unlock: srcu_read_unlock(&pmus_srcu, idx); return pmu; } static void attach_sb_event(struct perf_event *event) { struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); raw_spin_lock(&pel->lock); list_add_rcu(&event->sb_list, &pel->list); raw_spin_unlock(&pel->lock); } /* * We keep a list of all !task (and therefore per-cpu) events * that need to receive side-band records. * * This avoids having to scan all the various PMU per-cpu contexts * looking for them. */ static void account_pmu_sb_event(struct perf_event *event) { if (is_sb_event(event)) attach_sb_event(event); } /* Freq events need the tick to stay alive (see perf_event_task_tick). */ static void account_freq_event_nohz(void) { #ifdef CONFIG_NO_HZ_FULL /* Lock so we don't race with concurrent unaccount */ spin_lock(&nr_freq_lock); if (atomic_inc_return(&nr_freq_events) == 1) tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS); spin_unlock(&nr_freq_lock); #endif } static void account_freq_event(void) { if (tick_nohz_full_enabled()) account_freq_event_nohz(); else atomic_inc(&nr_freq_events); } static void account_event(struct perf_event *event) { bool inc = false; if (event->parent) return; if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) inc = true; if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.build_id) atomic_inc(&nr_build_id_events); if (event->attr.comm) atomic_inc(&nr_comm_events); if (event->attr.namespaces) atomic_inc(&nr_namespaces_events); if (event->attr.cgroup) atomic_inc(&nr_cgroup_events); if (event->attr.task) atomic_inc(&nr_task_events); if (event->attr.freq) account_freq_event(); if (event->attr.context_switch) { atomic_inc(&nr_switch_events); inc = true; } if (has_branch_stack(event)) inc = true; if (is_cgroup_event(event)) inc = true; if (event->attr.ksymbol) atomic_inc(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_inc(&nr_bpf_events); if (event->attr.text_poke) atomic_inc(&nr_text_poke_events); if (inc) { /* * We need the mutex here because static_branch_enable() * must complete *before* the perf_sched_count increment * becomes visible. 
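* (Otherwise a racing account_event() could take the atomic_inc_not_zero() * fast path below and install an event before the perf_sched_events key is * switched on; illustrative restatement.)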
*/ if (atomic_inc_not_zero(&perf_sched_count)) goto enabled; mutex_lock(&perf_sched_mutex); if (!atomic_read(&perf_sched_count)) { static_branch_enable(&perf_sched_events); /* * Guarantee that all CPUs observe the key change and * call the perf scheduling hooks before proceeding to * install events that need them. */ synchronize_rcu(); } /* * Now that we have waited for the sync_sched(), allow further * increments to by-pass the mutex. */ atomic_inc(&perf_sched_count); mutex_unlock(&perf_sched_mutex); } enabled: account_pmu_sb_event(event); } /* * Allocate and initialize an event structure */ static struct perf_event * perf_event_alloc(struct perf_event_attr *attr, int cpu, struct task_struct *task, struct perf_event *group_leader, struct perf_event *parent_event, perf_overflow_handler_t overflow_handler, void *context, int cgroup_fd) { struct pmu *pmu; struct perf_event *event; struct hw_perf_event *hwc; long err = -EINVAL; int node; if ((unsigned)cpu >= nr_cpu_ids) { if (!task || cpu != -1) return ERR_PTR(-EINVAL); } if (attr->sigtrap && !task) { /* Requires a task: avoid signalling random tasks. */ return ERR_PTR(-EINVAL); } node = (cpu >= 0) ? cpu_to_node(cpu) : -1; event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, node); if (!event) return ERR_PTR(-ENOMEM); /* * Single events are their own group leaders, with an * empty sibling list: */ if (!group_leader) group_leader = event; mutex_init(&event->child_mutex); INIT_LIST_HEAD(&event->child_list); INIT_LIST_HEAD(&event->event_entry); INIT_LIST_HEAD(&event->sibling_list); INIT_LIST_HEAD(&event->active_list); init_event_group(event); INIT_LIST_HEAD(&event->rb_entry); INIT_LIST_HEAD(&event->active_entry); INIT_LIST_HEAD(&event->addr_filters.list); INIT_HLIST_NODE(&event->hlist_entry); init_waitqueue_head(&event->waitq); init_irq_work(&event->pending_irq, perf_pending_irq); event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable); init_task_work(&event->pending_task, perf_pending_task); rcuwait_init(&event->pending_work_wait); mutex_init(&event->mmap_mutex); raw_spin_lock_init(&event->addr_filters.lock); atomic_long_set(&event->refcount, 1); event->cpu = cpu; event->attr = *attr; event->group_leader = group_leader; event->pmu = NULL; event->oncpu = -1; event->parent = parent_event; event->ns = get_pid_ns(task_active_pid_ns(current)); event->id = atomic64_inc_return(&perf_event_id); event->state = PERF_EVENT_STATE_INACTIVE; if (parent_event) event->event_caps = parent_event->event_caps; if (task) { event->attach_state = PERF_ATTACH_TASK; /* * XXX pmu::event_init needs to know what task to account to * and we cannot use the ctx information because we need the * pmu before we get a ctx.
*/ event->hw.target = get_task_struct(task); } event->clock = &local_clock; if (parent_event) event->clock = parent_event->clock; if (!overflow_handler && parent_event) { overflow_handler = parent_event->overflow_handler; context = parent_event->overflow_handler_context; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) if (parent_event->prog) { struct bpf_prog *prog = parent_event->prog; bpf_prog_inc(prog); event->prog = prog; } #endif } if (overflow_handler) { event->overflow_handler = overflow_handler; event->overflow_handler_context = context; } else if (is_write_backward(event)){ event->overflow_handler = perf_event_output_backward; event->overflow_handler_context = NULL; } else { event->overflow_handler = perf_event_output_forward; event->overflow_handler_context = NULL; } perf_event__state_init(event); pmu = NULL; hwc = &event->hw; hwc->sample_period = attr->sample_period; if (attr->freq && attr->sample_freq) hwc->sample_period = 1; hwc->last_period = hwc->sample_period; local64_set(&hwc->period_left, hwc->sample_period); /* * We do not support PERF_SAMPLE_READ on inherited events unless * PERF_SAMPLE_TID is also selected, which allows inherited events to * collect per-thread samples. * See perf_output_read(). */ if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID)) goto err_ns; if (!has_branch_stack(event)) event->attr.branch_sample_type = 0; pmu = perf_init_event(event); if (IS_ERR(pmu)) { err = PTR_ERR(pmu); goto err_ns; } /* * Disallow uncore-task events. Similarly, disallow uncore-cgroup * events (they don't make sense as the cgroup will be different * on other CPUs in the uncore mask). */ if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) { err = -EINVAL; goto err_pmu; } if (event->attr.aux_output && (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) || event->attr.aux_pause || event->attr.aux_resume)) { err = -EOPNOTSUPP; goto err_pmu; } if (event->attr.aux_pause && event->attr.aux_resume) { err = -EINVAL; goto err_pmu; } if (event->attr.aux_start_paused) { if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) { err = -EOPNOTSUPP; goto err_pmu; } event->hw.aux_paused = 1; } if (cgroup_fd != -1) { err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); if (err) goto err_pmu; } err = exclusive_event_init(event); if (err) goto err_pmu; if (has_addr_filter(event)) { event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters, sizeof(struct perf_addr_filter_range), GFP_KERNEL); if (!event->addr_filter_ranges) { err = -ENOMEM; goto err_per_task; } /* * Clone the parent's vma offsets: they are valid until exec() * even if the mm is not shared with the parent. 
*/ if (event->parent) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); raw_spin_lock_irq(&ifh->lock); memcpy(event->addr_filter_ranges, event->parent->addr_filter_ranges, pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range)); raw_spin_unlock_irq(&ifh->lock); } /* force hw sync on the address filters */ event->addr_filters_gen = 1; } if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { err = get_callchain_buffers(attr->sample_max_stack); if (err) goto err_addr_filters; } } err = security_perf_event_alloc(event); if (err) goto err_callchain_buffer; /* symmetric to unaccount_event() in _free_event() */ account_event(event); return event; err_callchain_buffer: if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) put_callchain_buffers(); } err_addr_filters: kfree(event->addr_filter_ranges); err_per_task: exclusive_event_destroy(event); err_pmu: if (is_cgroup_event(event)) perf_detach_cgroup(event); if (event->destroy) event->destroy(event); module_put(pmu->module); err_ns: if (event->hw.target) put_task_struct(event->hw.target); call_rcu(&event->rcu_head, free_event_rcu); return ERR_PTR(err); } static int perf_copy_attr(struct perf_event_attr __user *uattr, struct perf_event_attr *attr) { u32 size; int ret; /* Zero the full structure, so that a short copy will be nice. */ memset(attr, 0, sizeof(*attr)); ret = get_user(size, &uattr->size); if (ret) return ret; /* ABI compatibility quirk: */ if (!size) size = PERF_ATTR_SIZE_VER0; if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE) goto err_size; ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); if (ret) { if (ret == -E2BIG) goto err_size; return ret; } attr->size = size; if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) return -EINVAL; if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) return -EINVAL; if (attr->read_format & ~(PERF_FORMAT_MAX-1)) return -EINVAL; if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) { u64 mask = attr->branch_sample_type; /* only using defined bits */ if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1)) return -EINVAL; /* at least one branch bit must be set */ if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) return -EINVAL; /* propagate priv level, when not set for branch */ if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { /* exclude_kernel checked on syscall entry */ if (!attr->exclude_kernel) mask |= PERF_SAMPLE_BRANCH_KERNEL; if (!attr->exclude_user) mask |= PERF_SAMPLE_BRANCH_USER; if (!attr->exclude_hv) mask |= PERF_SAMPLE_BRANCH_HV; /* * adjust user setting (for HW filter setup) */ attr->branch_sample_type = mask; } /* privileged levels capture (kernel, hv): check permissions */ if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) { ret = perf_allow_kernel(attr); if (ret) return ret; } } if (attr->sample_type & PERF_SAMPLE_REGS_USER) { ret = perf_reg_validate(attr->sample_regs_user); if (ret) return ret; } if (attr->sample_type & PERF_SAMPLE_STACK_USER) { if (!arch_perf_have_user_stack_dump()) return -ENOSYS; /* * We have __u32 type for the size, but so far * we can only use __u16 as maximum due to the * __u16 sample size limit. 
*/ if (attr->sample_stack_user >= USHRT_MAX) return -EINVAL; else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64))) return -EINVAL; } if (!attr->sample_max_stack) attr->sample_max_stack = sysctl_perf_event_max_stack; if (attr->sample_type & PERF_SAMPLE_REGS_INTR) ret = perf_reg_validate(attr->sample_regs_intr); #ifndef CONFIG_CGROUP_PERF if (attr->sample_type & PERF_SAMPLE_CGROUP) return -EINVAL; #endif if ((attr->sample_type & PERF_SAMPLE_WEIGHT) && (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT)) return -EINVAL; if (!attr->inherit && attr->inherit_thread) return -EINVAL; if (attr->remove_on_exec && attr->enable_on_exec) return -EINVAL; if (attr->sigtrap && !attr->remove_on_exec) return -EINVAL; out: return ret; err_size: put_user(sizeof(*attr), &uattr->size); ret = -E2BIG; goto out; } static void mutex_lock_double(struct mutex *a, struct mutex *b) { if (b < a) swap(a, b); mutex_lock(a); mutex_lock_nested(b, SINGLE_DEPTH_NESTING); } static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { struct perf_buffer *rb = NULL; int ret = -EINVAL; if (!output_event) { mutex_lock(&event->mmap_mutex); goto set; } /* don't allow circular references */ if (event == output_event) goto out; /* * Don't allow cross-cpu buffers */ if (output_event->cpu != event->cpu) goto out; /* * If its not a per-cpu rb, it must be the same task. */ if (output_event->cpu == -1 && output_event->hw.target != event->hw.target) goto out; /* * Mixing clocks in the same buffer is trouble you don't need. */ if (output_event->clock != event->clock) goto out; /* * Either writing ring buffer from beginning or from end. * Mixing is not allowed. */ if (is_write_backward(output_event) != is_write_backward(event)) goto out; /* * If both events generate aux data, they must be on the same PMU */ if (has_aux(event) && has_aux(output_event) && event->pmu != output_event->pmu) goto out; /* * Hold both mmap_mutex to serialize against perf_mmap_close(). Since * output_event is already on rb->event_list, and the list iteration * restarts after every removal, it is guaranteed this new event is * observed *OR* if output_event is already removed, it's guaranteed we * observe !rb->mmap_count. 
*/ mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex); set: /* Can't redirect output if we've got an active mmap() */ if (atomic_read(&event->mmap_count)) goto unlock; if (output_event) { /* get the rb we want to redirect to */ rb = ring_buffer_get(output_event); if (!rb) goto unlock; /* did we race against perf_mmap_close() */ if (!atomic_read(&rb->mmap_count)) { ring_buffer_put(rb); goto unlock; } } ring_buffer_attach(event, rb); ret = 0; unlock: mutex_unlock(&event->mmap_mutex); if (output_event) mutex_unlock(&output_event->mmap_mutex); out: return ret; } static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) { bool nmi_safe = false; switch (clk_id) { case CLOCK_MONOTONIC: event->clock = &ktime_get_mono_fast_ns; nmi_safe = true; break; case CLOCK_MONOTONIC_RAW: event->clock = &ktime_get_raw_fast_ns; nmi_safe = true; break; case CLOCK_REALTIME: event->clock = &ktime_get_real_ns; break; case CLOCK_BOOTTIME: event->clock = &ktime_get_boottime_ns; break; case CLOCK_TAI: event->clock = &ktime_get_clocktai_ns; break; default: return -EINVAL; } if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI)) return -EINVAL; return 0; } static bool perf_check_permission(struct perf_event_attr *attr, struct task_struct *task) { unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS; bool is_capable = perfmon_capable(); if (attr->sigtrap) { /* * perf_event_attr::sigtrap sends signals to the other task. * Require the current task to also have CAP_KILL. */ rcu_read_lock(); is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL); rcu_read_unlock(); /* * If the required capabilities aren't available, checks for * ptrace permissions: upgrade to ATTACH, since sending signals * can effectively change the target task. */ ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS; } /* * Preserve ptrace permission check for backwards compatibility. The * ptrace check also includes checks that the current task and other * task have matching uids, and is therefore not done here explicitly. */ return is_capable || ptrace_may_access(task, ptrace_mode); } /** * sys_perf_event_open - open a performance event, associate it to a task/cpu * * @attr_uptr: event_id type attributes for monitoring/sampling * @pid: target pid * @cpu: target cpu * @group_fd: group leader event fd * @flags: perf event open flags */ SYSCALL_DEFINE5(perf_event_open, struct perf_event_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { struct perf_event *group_leader = NULL, *output_event = NULL; struct perf_event_pmu_context *pmu_ctx; struct perf_event *event, *sibling; struct perf_event_attr attr; struct perf_event_context *ctx; struct file *event_file = NULL; struct task_struct *task = NULL; struct pmu *pmu; int event_fd; int move_group = 0; int err; int f_flags = O_RDWR; int cgroup_fd = -1; /* for future expandability... */ if (flags & ~PERF_FLAG_ALL) return -EINVAL; err = perf_copy_attr(attr_uptr, &attr); if (err) return err; /* Do we allow access to perf_event_open(2) ? 
*/ err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); if (err) return err; if (!attr.exclude_kernel) { err = perf_allow_kernel(&attr); if (err) return err; } if (attr.namespaces) { if (!perfmon_capable()) return -EACCES; } if (attr.freq) { if (attr.sample_freq > sysctl_perf_event_sample_rate) return -EINVAL; } else { if (attr.sample_period & (1ULL << 63)) return -EINVAL; } /* Only privileged users can get physical addresses */ if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) { err = perf_allow_kernel(&attr); if (err) return err; } /* REGS_INTR can leak data, lockdown must prevent this */ if (attr.sample_type & PERF_SAMPLE_REGS_INTR) { err = security_locked_down(LOCKDOWN_PERF); if (err) return err; } /* * In cgroup mode, the pid argument is used to pass the fd * opened to the cgroup directory in cgroupfs. The cpu argument * designates the cpu on which to monitor threads from that * cgroup. */ if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) return -EINVAL; if (flags & PERF_FLAG_FD_CLOEXEC) f_flags |= O_CLOEXEC; event_fd = get_unused_fd_flags(f_flags); if (event_fd < 0) return event_fd; CLASS(fd, group)(group_fd); // group_fd == -1 => empty if (group_fd != -1) { if (!is_perf_file(group)) { err = -EBADF; goto err_fd; } group_leader = fd_file(group)->private_data; if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; if (flags & PERF_FLAG_FD_NO_GROUP) group_leader = NULL; } if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { task = find_lively_task_by_vpid(pid); if (IS_ERR(task)) { err = PTR_ERR(task); goto err_fd; } } if (task && group_leader && group_leader->attr.inherit != attr.inherit) { err = -EINVAL; goto err_task; } if (flags & PERF_FLAG_PID_CGROUP) cgroup_fd = pid; event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL, NULL, cgroup_fd); if (IS_ERR(event)) { err = PTR_ERR(event); goto err_task; } if (is_sampling_event(event)) { if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { err = -EOPNOTSUPP; goto err_alloc; } } /* * Special case software events and allow them to be part of * any hardware group. */ pmu = event->pmu; if (attr.use_clockid) { err = perf_event_set_clock(event, attr.clockid); if (err) goto err_alloc; } if (pmu->task_ctx_nr == perf_sw_context) event->event_caps |= PERF_EV_CAP_SOFTWARE; if (task) { err = down_read_interruptible(&task->signal->exec_update_lock); if (err) goto err_alloc; /* * We must hold exec_update_lock across this and any potential * perf_install_in_context() call for this new event to * serialize against exec() altering our credentials (and the * perf_event_exit_task() that could imply). */ err = -EACCES; if (!perf_check_permission(&attr, task)) goto err_cred; } /* * Get the target context (task or percpu): */ ctx = find_get_context(task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_cred; } mutex_lock(&ctx->mutex); if (ctx->task == TASK_TOMBSTONE) { err = -ESRCH; goto err_locked; } if (!task) { /* * Check if the @cpu we're creating an event for is online. * * We use the perf_cpu_context::ctx::mutex to serialize against * the hotplug notifiers. See perf_event_{init,exit}_cpu(). 
*/ struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu); if (!cpuctx->online) { err = -ENODEV; goto err_locked; } } if (group_leader) { err = -EINVAL; /* * Do not allow a recursive hierarchy (this new sibling * becoming part of another group-sibling): */ if (group_leader->group_leader != group_leader) goto err_locked; /* All events in a group should have the same clock */ if (group_leader->clock != event->clock) goto err_locked; /* * Make sure we're both events for the same CPU; * grouping events for different CPUs is broken, since * you can never concurrently schedule them anyhow. */ if (group_leader->cpu != event->cpu) goto err_locked; /* * Make sure we're both on the same context; either task or cpu. */ if (group_leader->ctx != ctx) goto err_locked; /* * Only a group leader can be exclusive or pinned */ if (attr.exclusive || attr.pinned) goto err_locked; if (is_software_event(event) && !in_software_context(group_leader)) { /* * If the event is a sw event, but the group_leader * is on hw context. * * Allow the addition of software events to hw * groups; this is safe because software events * never fail to schedule. * * Note the comment that goes with struct * perf_event_pmu_context. */ pmu = group_leader->pmu_ctx->pmu; } else if (!is_software_event(event)) { if (is_software_event(group_leader) && (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { /* * In case the group is a pure software group, and we * try to add a hardware event, move the whole group to * the hardware context. */ move_group = 1; } /* Don't allow group of multiple hw events from different pmus */ if (!in_software_context(group_leader) && group_leader->pmu_ctx->pmu != pmu) goto err_locked; } } /* * Now that we're certain of the pmu; find the pmu_ctx. */ pmu_ctx = find_get_pmu_context(pmu, ctx, event); if (IS_ERR(pmu_ctx)) { err = PTR_ERR(pmu_ctx); goto err_locked; } event->pmu_ctx = pmu_ctx; if (output_event) { err = perf_event_set_output(event, output_event); if (err) goto err_context; } if (!perf_event_validate_size(event)) { err = -E2BIG; goto err_context; } if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) { err = -EINVAL; goto err_context; } /* * Must be under the same ctx::mutex as perf_install_in_context(), * because we need to serialize with concurrent event creation. */ if (!exclusive_event_installable(event, ctx)) { err = -EBUSY; goto err_context; } WARN_ON_ONCE(ctx->parent_ctx); event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags); if (IS_ERR(event_file)) { err = PTR_ERR(event_file); event_file = NULL; goto err_context; } /* * This is the point of no return; we cannot fail hereafter. This is * where we start modifying current state. */ if (move_group) { perf_remove_from_context(group_leader, 0); put_pmu_ctx(group_leader->pmu_ctx); for_each_sibling_event(sibling, group_leader) { perf_remove_from_context(sibling, 0); put_pmu_ctx(sibling->pmu_ctx); } /* * Install the group siblings before the group leader. * * Because a group leader will try and install the entire group * (through the sibling list, which is still intact), we can * end up with siblings installed in the wrong context. * * By installing siblings first we NO-OP because they're not * reachable through the group lists. */ for_each_sibling_event(sibling, group_leader) { sibling->pmu_ctx = pmu_ctx; get_pmu_ctx(pmu_ctx); perf_event__state_init(sibling); perf_install_in_context(ctx, sibling, sibling->cpu); } /* * Removing from the context ends up with disabled * event.
What we want here is the event in the initial * startup state, ready to be added into the new context. */ group_leader->pmu_ctx = pmu_ctx; get_pmu_ctx(pmu_ctx); perf_event__state_init(group_leader); perf_install_in_context(ctx, group_leader, group_leader->cpu); } /* * Precalculate sample_data sizes; do while holding ctx::mutex such * that we're serialized against further additions and before * perf_install_in_context() which is the point the event is active and * can use these values. */ perf_event__header_size(event); perf_event__id_header_size(event); event->owner = current; perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); if (task) { up_read(&task->signal->exec_update_lock); put_task_struct(task); } mutex_lock(&current->perf_event_mutex); list_add_tail(&event->owner_entry, &current->perf_event_list); mutex_unlock(&current->perf_event_mutex); /* * File reference in group guarantees that group_leader has been * kept alive until we place the new event on the sibling_list. * This ensures destruction of the group leader will find * the pointer to itself in perf_group_detach(). */ fd_install(event_fd, event_file); return event_fd; err_context: put_pmu_ctx(event->pmu_ctx); event->pmu_ctx = NULL; /* _free_event() */ err_locked: mutex_unlock(&ctx->mutex); perf_unpin_context(ctx); put_ctx(ctx); err_cred: if (task) up_read(&task->signal->exec_update_lock); err_alloc: free_event(event); err_task: if (task) put_task_struct(task); err_fd: put_unused_fd(event_fd); return err; } /** * perf_event_create_kernel_counter * * @attr: attributes of the counter to create * @cpu: cpu in which the counter is bound * @task: task to profile (NULL for percpu) * @overflow_handler: callback to trigger when we hit the event * @context: context data could be used in overflow_handler callback */ struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct task_struct *task, perf_overflow_handler_t overflow_handler, void *context) { struct perf_event_pmu_context *pmu_ctx; struct perf_event_context *ctx; struct perf_event *event; struct pmu *pmu; int err; /* * Grouping is not supported for kernel events, neither is 'AUX'; * make sure the caller's intentions are adjusted. */ if (attr->aux_output || attr->aux_action) return ERR_PTR(-EINVAL); event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler, context, -1); if (IS_ERR(event)) { err = PTR_ERR(event); goto err; } /* Mark owner so we could distinguish it from user events. */ event->owner = TASK_TOMBSTONE; pmu = event->pmu; if (pmu->task_ctx_nr == perf_sw_context) event->event_caps |= PERF_EV_CAP_SOFTWARE; /* * Get the target context (task or percpu): */ ctx = find_get_context(task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_alloc; } WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); if (ctx->task == TASK_TOMBSTONE) { err = -ESRCH; goto err_unlock; } pmu_ctx = find_get_pmu_context(pmu, ctx, event); if (IS_ERR(pmu_ctx)) { err = PTR_ERR(pmu_ctx); goto err_unlock; } event->pmu_ctx = pmu_ctx; if (!task) { /* * Check if the @cpu we're creating an event for is online. * * We use the perf_cpu_context::ctx::mutex to serialize against * the hotplug notifiers. See perf_event_{init,exit}_cpu().
*/ struct perf_cpu_context *cpuctx = container_of(ctx, struct perf_cpu_context, ctx); if (!cpuctx->online) { err = -ENODEV; goto err_pmu_ctx; } } if (!exclusive_event_installable(event, ctx)) { err = -EBUSY; goto err_pmu_ctx; } perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); return event; err_pmu_ctx: put_pmu_ctx(pmu_ctx); event->pmu_ctx = NULL; /* _free_event() */ err_unlock: mutex_unlock(&ctx->mutex); perf_unpin_context(ctx); put_ctx(ctx); err_alloc: free_event(event); err: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); static void __perf_pmu_remove(struct perf_event_context *ctx, int cpu, struct pmu *pmu, struct perf_event_groups *groups, struct list_head *events) { struct perf_event *event, *sibling; perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) { perf_remove_from_context(event, 0); put_pmu_ctx(event->pmu_ctx); list_add(&event->migrate_entry, events); for_each_sibling_event(sibling, event) { perf_remove_from_context(sibling, 0); put_pmu_ctx(sibling->pmu_ctx); list_add(&sibling->migrate_entry, events); } } } static void __perf_pmu_install_event(struct pmu *pmu, struct perf_event_context *ctx, int cpu, struct perf_event *event) { struct perf_event_pmu_context *epc; struct perf_event_context *old_ctx = event->ctx; get_ctx(ctx); /* normally find_get_context() */ event->cpu = cpu; epc = find_get_pmu_context(pmu, ctx, event); event->pmu_ctx = epc; if (event->state >= PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_INACTIVE; perf_install_in_context(ctx, event, cpu); /* * Now that event->ctx is updated and visible, put the old ctx. */ put_ctx(old_ctx); } static void __perf_pmu_install(struct perf_event_context *ctx, int cpu, struct pmu *pmu, struct list_head *events) { struct perf_event *event, *tmp; /* * Re-instate events in 2 passes. * * Skip over group leaders and only install siblings on this first * pass, siblings will not get enabled without a leader, however a * leader will enable its siblings, even if those are still on the old * context. */ list_for_each_entry_safe(event, tmp, events, migrate_entry) { if (event->group_leader == event) continue; list_del(&event->migrate_entry); __perf_pmu_install_event(pmu, ctx, cpu, event); } /* * Once all the siblings are setup properly, install the group leaders * to make it go. */ list_for_each_entry_safe(event, tmp, events, migrate_entry) { list_del(&event->migrate_entry); __perf_pmu_install_event(pmu, ctx, cpu, event); } } void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) { struct perf_event_context *src_ctx, *dst_ctx; LIST_HEAD(events); /* * Since per-cpu context is persistent, no need to grab an extra * reference. */ src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx; dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx; /* * See perf_event_ctx_lock() for comments on the details * of swizzling perf_event::ctx. */ mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events); __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events); if (!list_empty(&events)) { /* * Wait for the events to quiesce before re-instating them. 
*/ synchronize_rcu(); __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events); } mutex_unlock(&dst_ctx->mutex); mutex_unlock(&src_ctx->mutex); } EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); static void sync_child_event(struct perf_event *child_event) { struct perf_event *parent_event = child_event->parent; u64 child_val; if (child_event->attr.inherit_stat) { struct task_struct *task = child_event->ctx->task; if (task && task != TASK_TOMBSTONE) perf_event_read_event(child_event, task); } child_val = perf_event_count(child_event, false); /* * Add back the child's count to the parent's count: */ atomic64_add(child_val, &parent_event->child_count); atomic64_add(child_event->total_time_enabled, &parent_event->child_total_time_enabled); atomic64_add(child_event->total_time_running, &parent_event->child_total_time_running); } static void perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event *parent_event = event->parent; unsigned long detach_flags = 0; if (parent_event) { /* * Do not destroy the 'original' grouping; because of the * context switch optimization the original events could've * ended up in a random child task. * * If we were to destroy the original group, all group related * operations would cease to function properly after this * random child dies. * * Do destroy all inherited groups; we don't care about those * and being thorough is better. */ detach_flags = DETACH_GROUP | DETACH_CHILD; mutex_lock(&parent_event->child_mutex); } perf_remove_from_context(event, detach_flags); raw_spin_lock_irq(&ctx->lock); if (event->state > PERF_EVENT_STATE_EXIT) perf_event_set_state(event, PERF_EVENT_STATE_EXIT); raw_spin_unlock_irq(&ctx->lock); /* * Child events can be freed. */ if (parent_event) { mutex_unlock(&parent_event->child_mutex); /* * Kick perf_poll() for is_event_hup(); */ perf_event_wakeup(parent_event); free_event(event); put_event(parent_event); return; } /* * Parent events are governed by their filedesc, retain them. */ perf_event_wakeup(event); } static void perf_event_exit_task_context(struct task_struct *child) { struct perf_event_context *child_ctx, *clone_ctx = NULL; struct perf_event *child_event, *next; WARN_ON_ONCE(child != current); child_ctx = perf_pin_task_context(child); if (!child_ctx) return; /* * In order to reduce the amount of trickiness in ctx tear-down, we hold * ctx::mutex over the entire thing. This serializes against almost * everything that wants to access the ctx. * * The exception is sys_perf_event_open() / * perf_event_create_kernel_counter() which does find_get_context() * without ctx::mutex (it cannot because of the move_group double mutex * lock thing). See the comments in perf_install_in_context(). */ mutex_lock(&child_ctx->mutex); /* * In a single ctx::lock section, de-schedule the events and detach the * context from the task such that we cannot ever get it scheduled back * in. */ raw_spin_lock_irq(&child_ctx->lock); task_ctx_sched_out(child_ctx, NULL, EVENT_ALL); /* * Now that the context is inactive, destroy the task <-> ctx relation * and mark the context dead. */ RCU_INIT_POINTER(child->perf_event_ctxp, NULL); put_ctx(child_ctx); /* cannot be last */ WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); put_task_struct(current); /* cannot be last */ clone_ctx = unclone_ctx(child_ctx); raw_spin_unlock_irq(&child_ctx->lock); if (clone_ctx) put_ctx(clone_ctx); /* * Report the task dead after unscheduling the events so that we * won't get any samples after PERF_RECORD_EXIT.
We can however still * get a few PERF_RECORD_READ events. */ perf_event_task(child, child_ctx, 0); list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) perf_event_exit_event(child_event, child_ctx); mutex_unlock(&child_ctx->mutex); put_ctx(child_ctx); } /* * When a child task exits, feed back event values to parent events. * * Can be called with exec_update_lock held when called from * setup_new_exec(). */ void perf_event_exit_task(struct task_struct *child) { struct perf_event *event, *tmp; mutex_lock(&child->perf_event_mutex); list_for_each_entry_safe(event, tmp, &child->perf_event_list, owner_entry) { list_del_init(&event->owner_entry); /* * Ensure the list deletion is visible before we clear * the owner, closes a race against perf_release() where * we need to serialize on the owner->perf_event_mutex. */ smp_store_release(&event->owner, NULL); } mutex_unlock(&child->perf_event_mutex); perf_event_exit_task_context(child); /* * The perf_event_exit_task_context calls perf_event_task * with child's task_ctx, which generates EXIT events for * child contexts and sets child->perf_event_ctxp[] to NULL. * At this point we need to send EXIT events to cpu contexts. */ perf_event_task(child, NULL, 0); } static void perf_free_event(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event *parent = event->parent; if (WARN_ON_ONCE(!parent)) return; mutex_lock(&parent->child_mutex); list_del_init(&event->child_list); mutex_unlock(&parent->child_mutex); put_event(parent); raw_spin_lock_irq(&ctx->lock); perf_group_detach(event); list_del_event(event, ctx); raw_spin_unlock_irq(&ctx->lock); free_event(event); } /* * Free a context as created by inheritance by perf_event_init_task() below, * used by fork() in case of fail. * * Even though the task has never lived, the context and events have been * exposed through the child_list, so we must take care tearing it all down. */ void perf_event_free_task(struct task_struct *task) { struct perf_event_context *ctx; struct perf_event *event, *tmp; ctx = rcu_access_pointer(task->perf_event_ctxp); if (!ctx) return; mutex_lock(&ctx->mutex); raw_spin_lock_irq(&ctx->lock); /* * Destroy the task <-> ctx relation and mark the context dead. * * This is important because even though the task hasn't been * exposed yet the context has been (through child_list). */ RCU_INIT_POINTER(task->perf_event_ctxp, NULL); WRITE_ONCE(ctx->task, TASK_TOMBSTONE); put_task_struct(task); /* cannot be last */ raw_spin_unlock_irq(&ctx->lock); list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) perf_free_event(event, ctx); mutex_unlock(&ctx->mutex); /* * perf_event_release_kernel() could've stolen some of our * child events and still have them on its free_list. In that * case we must wait for these events to have been freed (in * particular all their references to this task must've been * dropped). * * Without this copy_process() will unconditionally free this * task (irrespective of its reference count) and * _free_event()'s put_task_struct(event->hw.target) will be a * use-after-free. * * Wait for all events to drop their context reference. 
*/ wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1); put_ctx(ctx); /* must be last */ } void perf_event_delayed_put(struct task_struct *task) { WARN_ON_ONCE(task->perf_event_ctxp); } struct file *perf_event_get(unsigned int fd) { struct file *file = fget(fd); if (!file) return ERR_PTR(-EBADF); if (file->f_op != &perf_fops) { fput(file); return ERR_PTR(-EBADF); } return file; } const struct perf_event *perf_get_event(struct file *file) { if (file->f_op != &perf_fops) return ERR_PTR(-EINVAL); return file->private_data; } const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { if (!event) return ERR_PTR(-EINVAL); return &event->attr; } int perf_allow_kernel(struct perf_event_attr *attr) { if (sysctl_perf_event_paranoid > 1 && !perfmon_capable()) return -EACCES; return security_perf_event_open(attr, PERF_SECURITY_KERNEL); } EXPORT_SYMBOL_GPL(perf_allow_kernel); /* * Inherit an event from parent task to child task. * * Returns: * - valid pointer on success * - NULL for orphaned events * - IS_ERR() on error */ static struct perf_event * inherit_event(struct perf_event *parent_event, struct task_struct *parent, struct perf_event_context *parent_ctx, struct task_struct *child, struct perf_event *group_leader, struct perf_event_context *child_ctx) { enum perf_event_state parent_state = parent_event->state; struct perf_event_pmu_context *pmu_ctx; struct perf_event *child_event; unsigned long flags; /* * Instead of creating recursive hierarchies of events, * we link inherited events back to the original parent, * which has a filp for sure, which we use as the reference * count: */ if (parent_event->parent) parent_event = parent_event->parent; child_event = perf_event_alloc(&parent_event->attr, parent_event->cpu, child, group_leader, parent_event, NULL, NULL, -1); if (IS_ERR(child_event)) return child_event; pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event); if (IS_ERR(pmu_ctx)) { free_event(child_event); return ERR_CAST(pmu_ctx); } child_event->pmu_ctx = pmu_ctx; /* * is_orphaned_event() and list_add_tail(&parent_event->child_list) * must be under the same lock in order to serialize against * perf_event_release_kernel(), such that either we must observe * is_orphaned_event() or they will observe us on the child_list. */ mutex_lock(&parent_event->child_mutex); if (is_orphaned_event(parent_event) || !atomic_long_inc_not_zero(&parent_event->refcount)) { mutex_unlock(&parent_event->child_mutex); /* task_ctx_data is freed with child_ctx */ free_event(child_event); return NULL; } get_ctx(child_ctx); /* * Make the child state follow the state of the parent event, * not its attr.disabled bit. We hold the parent's mutex, * so we won't race with perf_event_{en, dis}able_family. 
*/ if (parent_state >= PERF_EVENT_STATE_INACTIVE) child_event->state = PERF_EVENT_STATE_INACTIVE; else child_event->state = PERF_EVENT_STATE_OFF; if (parent_event->attr.freq) { u64 sample_period = parent_event->hw.sample_period; struct hw_perf_event *hwc = &child_event->hw; hwc->sample_period = sample_period; hwc->last_period = sample_period; local64_set(&hwc->period_left, sample_period); } child_event->ctx = child_ctx; child_event->overflow_handler = parent_event->overflow_handler; child_event->overflow_handler_context = parent_event->overflow_handler_context; /* * Precalculate sample_data sizes */ perf_event__header_size(child_event); perf_event__id_header_size(child_event); /* * Link it up in the child's context: */ raw_spin_lock_irqsave(&child_ctx->lock, flags); add_event_to_ctx(child_event, child_ctx); child_event->attach_state |= PERF_ATTACH_CHILD; raw_spin_unlock_irqrestore(&child_ctx->lock, flags); /* * Link this into the parent event's child list */ list_add_tail(&child_event->child_list, &parent_event->child_list); mutex_unlock(&parent_event->child_mutex); return child_event; } /* * Inherits an event group. * * This will quietly suppress orphaned events; !inherit_event() is not an error. * This matches with perf_event_release_kernel() removing all child events. * * Returns: * - 0 on success * - <0 on error */ static int inherit_group(struct perf_event *parent_event, struct task_struct *parent, struct perf_event_context *parent_ctx, struct task_struct *child, struct perf_event_context *child_ctx) { struct perf_event *leader; struct perf_event *sub; struct perf_event *child_ctr; leader = inherit_event(parent_event, parent, parent_ctx, child, NULL, child_ctx); if (IS_ERR(leader)) return PTR_ERR(leader); /* * @leader can be NULL here because of is_orphaned_event(). In this * case inherit_event() will create individual events, similar to what * perf_group_detach() would do anyway. */ for_each_sibling_event(sub, parent_event) { child_ctr = inherit_event(sub, parent, parent_ctx, child, leader, child_ctx); if (IS_ERR(child_ctr)) return PTR_ERR(child_ctr); if (sub->aux_event == parent_event && child_ctr && !perf_get_aux_event(child_ctr, leader)) return -EINVAL; } if (leader) leader->group_generation = parent_event->group_generation; return 0; } /* * Creates the child task context and tries to inherit the event-group. * * Clears @inherited_all on !attr.inherited or error. Note that we'll leave * inherited_all set when we 'fail' to inherit an orphaned event; this is * consistent with perf_event_release_kernel() removing all child events. * * Returns: * - 0 on success * - <0 on error */ static int inherit_task_group(struct perf_event *event, struct task_struct *parent, struct perf_event_context *parent_ctx, struct task_struct *child, u64 clone_flags, int *inherited_all) { struct perf_event_context *child_ctx; int ret; if (!event->attr.inherit || (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) || /* Do not inherit if sigtrap and signal handlers were cleared. */ (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))) { *inherited_all = 0; return 0; } child_ctx = child->perf_event_ctxp; if (!child_ctx) { /* * This is executed from the parent task context, so * inherit events that have been marked for cloning. * First allocate and initialize a context for the * child. 
*/ child_ctx = alloc_perf_context(child); if (!child_ctx) return -ENOMEM; child->perf_event_ctxp = child_ctx; } ret = inherit_group(event, parent, parent_ctx, child, child_ctx); if (ret) *inherited_all = 0; return ret; } /* * Initialize the perf_event context in task_struct */ static int perf_event_init_context(struct task_struct *child, u64 clone_flags) { struct perf_event_context *child_ctx, *parent_ctx; struct perf_event_context *cloned_ctx; struct perf_event *event; struct task_struct *parent = current; int inherited_all = 1; unsigned long flags; int ret = 0; if (likely(!parent->perf_event_ctxp)) return 0; /* * If the parent's context is a clone, pin it so it won't get * swapped under us. */ parent_ctx = perf_pin_task_context(parent); if (!parent_ctx) return 0; /* * No need to check if parent_ctx != NULL here; since we saw * it non-NULL earlier, the only reason for it to become NULL * is if we exit, and since we're currently in the middle of * a fork we can't be exiting at the same time. */ /* * Lock the parent list. No need to lock the child - not PID * hashed yet and not running, so nobody can access it. */ mutex_lock(&parent_ctx->mutex); /* * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ perf_event_groups_for_each(event, &parent_ctx->pinned_groups) { ret = inherit_task_group(event, parent, parent_ctx, child, clone_flags, &inherited_all); if (ret) goto out_unlock; } /* * We can't hold ctx->lock when iterating the ->flexible_group list due * to allocations, but we need to prevent rotation because * rotate_ctx() will change the list from interrupt context. */ raw_spin_lock_irqsave(&parent_ctx->lock, flags); parent_ctx->rotate_disable = 1; raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); perf_event_groups_for_each(event, &parent_ctx->flexible_groups) { ret = inherit_task_group(event, parent, parent_ctx, child, clone_flags, &inherited_all); if (ret) goto out_unlock; } raw_spin_lock_irqsave(&parent_ctx->lock, flags); parent_ctx->rotate_disable = 0; child_ctx = child->perf_event_ctxp; if (child_ctx && inherited_all) { /* * Mark the child context as a clone of the parent * context, or of whatever the parent is a clone of. * * Note that if the parent is a clone, the holding of * parent_ctx->lock avoids it from being uncloned. 
*/ cloned_ctx = parent_ctx->parent_ctx; if (cloned_ctx) { child_ctx->parent_ctx = cloned_ctx; child_ctx->parent_gen = parent_ctx->parent_gen; } else { child_ctx->parent_ctx = parent_ctx; child_ctx->parent_gen = parent_ctx->generation; } get_ctx(child_ctx->parent_ctx); } raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); out_unlock: mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); put_ctx(parent_ctx); return ret; } /* * Initialize the perf_event context in task_struct */ int perf_event_init_task(struct task_struct *child, u64 clone_flags) { int ret; memset(child->perf_recursion, 0, sizeof(child->perf_recursion)); child->perf_event_ctxp = NULL; mutex_init(&child->perf_event_mutex); INIT_LIST_HEAD(&child->perf_event_list); ret = perf_event_init_context(child, clone_flags); if (ret) { perf_event_free_task(child); return ret; } return 0; } static void __init perf_event_init_all_cpus(void) { struct swevent_htable *swhash; struct perf_cpu_context *cpuctx; int cpu; zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL); zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL); zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL); zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL); zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL); zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL); for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); __perf_event_init_context(&cpuctx->ctx); lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default); cpuctx->heap = cpuctx->heap_default; } } static void perf_swevent_init_cpu(unsigned int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) { struct swevent_hlist *hlist; hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); WARN_ON(!hlist); rcu_assign_pointer(swhash->swevent_hlist, hlist); } mutex_unlock(&swhash->hlist_mutex); } #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE static void __perf_event_exit_context(void *__info) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *ctx = __info; struct perf_event *event; raw_spin_lock(&ctx->lock); ctx_sched_out(ctx, NULL, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); raw_spin_unlock(&ctx->lock); } static void perf_event_clear_cpumask(unsigned int cpu) { int target[PERF_PMU_MAX_SCOPE]; unsigned int scope; struct pmu *pmu; cpumask_clear_cpu(cpu, perf_online_mask); for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu); struct cpumask *pmu_cpumask = perf_scope_cpumask(scope); target[scope] = -1; if (WARN_ON_ONCE(!pmu_cpumask || !cpumask)) continue; if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask)) continue; target[scope] = cpumask_any_but(cpumask, cpu); if (target[scope] < nr_cpu_ids) cpumask_set_cpu(target[scope], pmu_cpumask); } /* migrate */ list_for_each_entry(pmu, &pmus, entry) { if (pmu->scope == PERF_PMU_SCOPE_NONE || 
WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE)) continue; if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids) perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]); } } static void perf_event_exit_cpu_context(int cpu) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; // XXX simplify cpuctx->online mutex_lock(&pmus_lock); /* * Clear the cpumasks, and migrate to other CPUs if possible. * Must be invoked before the __perf_event_exit_context. */ perf_event_clear_cpumask(cpu); cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); ctx = &cpuctx->ctx; mutex_lock(&ctx->mutex); smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); cpuctx->online = 0; mutex_unlock(&ctx->mutex); mutex_unlock(&pmus_lock); } #else static void perf_event_exit_cpu_context(int cpu) { } #endif static void perf_event_setup_cpumask(unsigned int cpu) { struct cpumask *pmu_cpumask; unsigned int scope; /* * Early boot stage, the cpumask hasn't been set yet. * The perf_online_<domain>_masks includes the first CPU of each domain. * Always unconditionally set the boot CPU for the perf_online_<domain>_masks. */ if (cpumask_empty(perf_online_mask)) { for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { pmu_cpumask = perf_scope_cpumask(scope); if (WARN_ON_ONCE(!pmu_cpumask)) continue; cpumask_set_cpu(cpu, pmu_cpumask); } goto end; } for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu); pmu_cpumask = perf_scope_cpumask(scope); if (WARN_ON_ONCE(!pmu_cpumask || !cpumask)) continue; if (!cpumask_empty(cpumask) && cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids) cpumask_set_cpu(cpu, pmu_cpumask); } end: cpumask_set_cpu(cpu, perf_online_mask); } int perf_event_init_cpu(unsigned int cpu) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; perf_swevent_init_cpu(cpu); mutex_lock(&pmus_lock); perf_event_setup_cpumask(cpu); cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); ctx = &cpuctx->ctx; mutex_lock(&ctx->mutex); cpuctx->online = 1; mutex_unlock(&ctx->mutex); mutex_unlock(&pmus_lock); return 0; } int perf_event_exit_cpu(unsigned int cpu) { perf_event_exit_cpu_context(cpu); return 0; } static int perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) { int cpu; for_each_online_cpu(cpu) perf_event_exit_cpu(cpu); return NOTIFY_OK; } /* * Run the perf reboot notifier at the very last possible moment so that * the generic watchdog code runs as long as possible. */ static struct notifier_block perf_reboot_notifier = { .notifier_call = perf_reboot, .priority = INT_MIN, }; void __init perf_event_init(void) { int ret; idr_init(&pmu_idr); perf_event_init_all_cpus(); init_srcu_struct(&pmus_srcu); perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); perf_pmu_register(&perf_cpu_clock, "cpu_clock", -1); perf_pmu_register(&perf_task_clock, "task_clock", -1); perf_tp_register(); perf_event_init_cpu(smp_processor_id()); register_reboot_notifier(&perf_reboot_notifier); ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); perf_event_cache = KMEM_CACHE(perf_event, SLAB_PANIC); /* * Build time assertion that we keep the data_head at the intended * location. IOW, validation we got the __reserved[] size right. 
*/ BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head)) != 1024); } ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_attr *pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); if (pmu_attr->event_str) return sprintf(page, "%s\n", pmu_attr->event_str); return 0; } EXPORT_SYMBOL_GPL(perf_event_sysfs_show); static int __init perf_event_sysfs_init(void) { struct pmu *pmu; int ret; mutex_lock(&pmus_lock); ret = bus_register(&pmu_bus); if (ret) goto unlock; list_for_each_entry(pmu, &pmus, entry) { if (pmu->dev) continue; ret = pmu_dev_alloc(pmu); WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret); } pmu_bus_running = 1; ret = 0; unlock: mutex_unlock(&pmus_lock); return ret; } device_initcall(perf_event_sysfs_init); #ifdef CONFIG_CGROUP_PERF static struct cgroup_subsys_state * perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct perf_cgroup *jc; jc = kzalloc(sizeof(*jc), GFP_KERNEL); if (!jc) return ERR_PTR(-ENOMEM); jc->info = alloc_percpu(struct perf_cgroup_info); if (!jc->info) { kfree(jc); return ERR_PTR(-ENOMEM); } return &jc->css; } static void perf_cgroup_css_free(struct cgroup_subsys_state *css) { struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css); free_percpu(jc->info); kfree(jc); } static int perf_cgroup_css_online(struct cgroup_subsys_state *css) { perf_event_cgroup(css->cgroup); return 0; } static int __perf_cgroup_move(void *info) { struct task_struct *task = info; preempt_disable(); perf_cgroup_switch(task); preempt_enable(); return 0; } static void perf_cgroup_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *css; cgroup_taskset_for_each(task, css, tset) task_function_call(task, __perf_cgroup_move, task); } struct cgroup_subsys perf_event_cgrp_subsys = { .css_alloc = perf_cgroup_css_alloc, .css_free = perf_cgroup_css_free, .css_online = perf_cgroup_css_online, .attach = perf_cgroup_attach, /* * Implicitly enable on dfl hierarchy so that perf events can * always be filtered by cgroup2 path as long as perf_event * controller is not mounted on a legacy hierarchy. */ .implicit_on_dfl = true, .threaded = true, }; #endif /* CONFIG_CGROUP_PERF */ DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);
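The tail of kernel/events/core.c above exports perf_event_create_kernel_counter() (and perf_pmu_migrate_context()) for in-kernel users. Below is a hedged sketch of how a module might drive that exported API; the attribute choices (a pinned CPU-cycles counter on CPU 0) and the helper names are assumptions for illustration, not code from this file.

/*
 * Hedged sketch: counting CPU cycles from kernel code via the API
 * exported above. Assumes process context where sleeping is allowed.
 */
#include <linux/perf_event.h>
#include <linux/err.h>

static struct perf_event *cycles_event;

static int start_cycle_counter(void)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_HARDWARE,
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.size		= sizeof(attr),
		.pinned		= 1,	/* go to error state rather than multiplex */
		.exclude_hv	= 1,
	};

	/* CPU-bound counter on CPU 0, no task, no overflow handler. */
	cycles_event = perf_event_create_kernel_counter(&attr, 0, NULL, NULL, NULL);
	if (IS_ERR(cycles_event))
		return PTR_ERR(cycles_event);
	return 0;
}

static u64 read_and_release_cycle_counter(void)
{
	u64 enabled, running, count;

	count = perf_event_read_value(cycles_event, &enabled, &running);
	perf_event_release_kernel(cycles_event);
	return count;
}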
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/ #include "netlink.h" #include "device.h" #include "peer.h" #include "socket.h" #include "queueing.h" #include "messages.h" #include <uapi/linux/wireguard.h> #include <linux/if.h> #include <net/genetlink.h> #include <net/sock.h> #include <crypto/utils.h> static struct genl_family genl_family; static const struct nla_policy device_policy[WGDEVICE_A_MAX + 1] = { [WGDEVICE_A_IFINDEX] = { .type = NLA_U32 }, [WGDEVICE_A_IFNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [WGDEVICE_A_PRIVATE_KEY] = NLA_POLICY_EXACT_LEN(NOISE_PUBLIC_KEY_LEN), [WGDEVICE_A_PUBLIC_KEY] = NLA_POLICY_EXACT_LEN(NOISE_PUBLIC_KEY_LEN), [WGDEVICE_A_FLAGS] = { .type = NLA_U32 }, [WGDEVICE_A_LISTEN_PORT] = { .type = NLA_U16 }, [WGDEVICE_A_FWMARK] = { .type = NLA_U32 }, [WGDEVICE_A_PEERS] = { .type = NLA_NESTED } }; static const struct nla_policy peer_policy[WGPEER_A_MAX + 1] = { [WGPEER_A_PUBLIC_KEY] = NLA_POLICY_EXACT_LEN(NOISE_PUBLIC_KEY_LEN), [WGPEER_A_PRESHARED_KEY] = NLA_POLICY_EXACT_LEN(NOISE_SYMMETRIC_KEY_LEN), [WGPEER_A_FLAGS] = { .type = NLA_U32 }, [WGPEER_A_ENDPOINT] = NLA_POLICY_MIN_LEN(sizeof(struct sockaddr)), [WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL] = { .type = NLA_U16 }, [WGPEER_A_LAST_HANDSHAKE_TIME] = NLA_POLICY_EXACT_LEN(sizeof(struct __kernel_timespec)), [WGPEER_A_RX_BYTES] = { .type = NLA_U64 }, [WGPEER_A_TX_BYTES] = { .type = NLA_U64 }, [WGPEER_A_ALLOWEDIPS] = { .type = NLA_NESTED }, [WGPEER_A_PROTOCOL_VERSION] = { .type = NLA_U32 } }; static const struct nla_policy allowedip_policy[WGALLOWEDIP_A_MAX + 1] = { [WGALLOWEDIP_A_FAMILY] = { .type = NLA_U16 }, [WGALLOWEDIP_A_IPADDR] = NLA_POLICY_MIN_LEN(sizeof(struct in_addr)), [WGALLOWEDIP_A_CIDR_MASK] = { .type = NLA_U8 } }; static struct wg_device *lookup_interface(struct nlattr **attrs, struct sk_buff *skb) { struct net_device *dev = NULL; if (!attrs[WGDEVICE_A_IFINDEX] == !attrs[WGDEVICE_A_IFNAME]) return ERR_PTR(-EBADR); if (attrs[WGDEVICE_A_IFINDEX]) dev = dev_get_by_index(sock_net(skb->sk), nla_get_u32(attrs[WGDEVICE_A_IFINDEX])); else if (attrs[WGDEVICE_A_IFNAME]) dev = dev_get_by_name(sock_net(skb->sk), nla_data(attrs[WGDEVICE_A_IFNAME])); if (!dev) return ERR_PTR(-ENODEV); if (!dev->rtnl_link_ops || !dev->rtnl_link_ops->kind || strcmp(dev->rtnl_link_ops->kind, KBUILD_MODNAME)) { dev_put(dev); return ERR_PTR(-EOPNOTSUPP); } return netdev_priv(dev); } static int get_allowedips(struct sk_buff *skb, const u8 *ip, u8 cidr, int family) { struct nlattr *allowedip_nest; allowedip_nest = nla_nest_start(skb, 0); if (!allowedip_nest) return -EMSGSIZE; if (nla_put_u8(skb, WGALLOWEDIP_A_CIDR_MASK, cidr) || nla_put_u16(skb, WGALLOWEDIP_A_FAMILY, family) || nla_put(skb, WGALLOWEDIP_A_IPADDR, family == AF_INET6 ? 
sizeof(struct in6_addr) : sizeof(struct in_addr), ip)) { nla_nest_cancel(skb, allowedip_nest); return -EMSGSIZE; } nla_nest_end(skb, allowedip_nest); return 0; } struct dump_ctx { struct wg_device *wg; struct wg_peer *next_peer; u64 allowedips_seq; struct allowedips_node *next_allowedip; }; #define DUMP_CTX(cb) ((struct dump_ctx *)(cb)->args) static int get_peer(struct wg_peer *peer, struct sk_buff *skb, struct dump_ctx *ctx) { struct nlattr *allowedips_nest, *peer_nest = nla_nest_start(skb, 0); struct allowedips_node *allowedips_node = ctx->next_allowedip; bool fail; if (!peer_nest) return -EMSGSIZE; down_read(&peer->handshake.lock); fail = nla_put(skb, WGPEER_A_PUBLIC_KEY, NOISE_PUBLIC_KEY_LEN, peer->handshake.remote_static); up_read(&peer->handshake.lock); if (fail) goto err; if (!allowedips_node) { const struct __kernel_timespec last_handshake = { .tv_sec = peer->walltime_last_handshake.tv_sec, .tv_nsec = peer->walltime_last_handshake.tv_nsec }; down_read(&peer->handshake.lock); fail = nla_put(skb, WGPEER_A_PRESHARED_KEY, NOISE_SYMMETRIC_KEY_LEN, peer->handshake.preshared_key); up_read(&peer->handshake.lock); if (fail) goto err; if (nla_put(skb, WGPEER_A_LAST_HANDSHAKE_TIME, sizeof(last_handshake), &last_handshake) || nla_put_u16(skb, WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL, peer->persistent_keepalive_interval) || nla_put_u64_64bit(skb, WGPEER_A_TX_BYTES, peer->tx_bytes, WGPEER_A_UNSPEC) || nla_put_u64_64bit(skb, WGPEER_A_RX_BYTES, peer->rx_bytes, WGPEER_A_UNSPEC) || nla_put_u32(skb, WGPEER_A_PROTOCOL_VERSION, 1)) goto err; read_lock_bh(&peer->endpoint_lock); if (peer->endpoint.addr.sa_family == AF_INET) fail = nla_put(skb, WGPEER_A_ENDPOINT, sizeof(peer->endpoint.addr4), &peer->endpoint.addr4); else if (peer->endpoint.addr.sa_family == AF_INET6) fail = nla_put(skb, WGPEER_A_ENDPOINT, sizeof(peer->endpoint.addr6), &peer->endpoint.addr6); read_unlock_bh(&peer->endpoint_lock); if (fail) goto err; allowedips_node = list_first_entry_or_null(&peer->allowedips_list, struct allowedips_node, peer_list); } if (!allowedips_node) goto no_allowedips; if (!ctx->allowedips_seq) ctx->allowedips_seq = ctx->wg->peer_allowedips.seq; else if (ctx->allowedips_seq != ctx->wg->peer_allowedips.seq) goto no_allowedips; allowedips_nest = nla_nest_start(skb, WGPEER_A_ALLOWEDIPS); if (!allowedips_nest) goto err; list_for_each_entry_from(allowedips_node, &peer->allowedips_list, peer_list) { u8 cidr, ip[16] __aligned(__alignof(u64)); int family; family = wg_allowedips_read_node(allowedips_node, ip, &cidr); if (get_allowedips(skb, ip, cidr, family)) { nla_nest_end(skb, allowedips_nest); nla_nest_end(skb, peer_nest); ctx->next_allowedip = allowedips_node; return -EMSGSIZE; } } nla_nest_end(skb, allowedips_nest); no_allowedips: nla_nest_end(skb, peer_nest); ctx->next_allowedip = NULL; ctx->allowedips_seq = 0; return 0; err: nla_nest_cancel(skb, peer_nest); return -EMSGSIZE; } static int wg_get_device_start(struct netlink_callback *cb) { struct wg_device *wg; wg = lookup_interface(genl_info_dump(cb)->attrs, cb->skb); if (IS_ERR(wg)) return PTR_ERR(wg); DUMP_CTX(cb)->wg = wg; return 0; } static int wg_get_device_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct wg_peer *peer, *next_peer_cursor; struct dump_ctx *ctx = DUMP_CTX(cb); struct wg_device *wg = ctx->wg; struct nlattr *peers_nest; int ret = -EMSGSIZE; bool done = true; void *hdr; rtnl_lock(); mutex_lock(&wg->device_update_lock); cb->seq = wg->device_update_gen; next_peer_cursor = ctx->next_peer; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, 
cb->nlh->nlmsg_seq, &genl_family, NLM_F_MULTI, WG_CMD_GET_DEVICE); if (!hdr) goto out; genl_dump_check_consistent(cb, hdr); if (!ctx->next_peer) { if (nla_put_u16(skb, WGDEVICE_A_LISTEN_PORT, wg->incoming_port) || nla_put_u32(skb, WGDEVICE_A_FWMARK, wg->fwmark) || nla_put_u32(skb, WGDEVICE_A_IFINDEX, wg->dev->ifindex) || nla_put_string(skb, WGDEVICE_A_IFNAME, wg->dev->name)) goto out; down_read(&wg->static_identity.lock); if (wg->static_identity.has_identity) { if (nla_put(skb, WGDEVICE_A_PRIVATE_KEY, NOISE_PUBLIC_KEY_LEN, wg->static_identity.static_private) || nla_put(skb, WGDEVICE_A_PUBLIC_KEY, NOISE_PUBLIC_KEY_LEN, wg->static_identity.static_public)) { up_read(&wg->static_identity.lock); goto out; } } up_read(&wg->static_identity.lock); } peers_nest = nla_nest_start(skb, WGDEVICE_A_PEERS); if (!peers_nest) goto out; ret = 0; lockdep_assert_held(&wg->device_update_lock); /* If the last cursor was removed in peer_remove or peer_remove_all, then * we just treat this the same as there being no more peers left. The * reason is that seq_nr should indicate to userspace that this isn't a * coherent dump anyway, so they'll try again. */ if (list_empty(&wg->peer_list) || (ctx->next_peer && ctx->next_peer->is_dead)) { nla_nest_cancel(skb, peers_nest); goto out; } peer = list_prepare_entry(ctx->next_peer, &wg->peer_list, peer_list); list_for_each_entry_continue(peer, &wg->peer_list, peer_list) { if (get_peer(peer, skb, ctx)) { done = false; break; } next_peer_cursor = peer; } nla_nest_end(skb, peers_nest); out: if (!ret && !done && next_peer_cursor) wg_peer_get(next_peer_cursor); wg_peer_put(ctx->next_peer); mutex_unlock(&wg->device_update_lock); rtnl_unlock(); if (ret) { genlmsg_cancel(skb, hdr); return ret; } genlmsg_end(skb, hdr); if (done) { ctx->next_peer = NULL; return 0; } ctx->next_peer = next_peer_cursor; return skb->len; /* At this point, we can't really deal ourselves with safely zeroing out * the private key material after usage. This will need an additional API * in the kernel for marking skbs as zero_on_free. 
*/ } static int wg_get_device_done(struct netlink_callback *cb) { struct dump_ctx *ctx = DUMP_CTX(cb); if (ctx->wg) dev_put(ctx->wg->dev); wg_peer_put(ctx->next_peer); return 0; } static int set_port(struct wg_device *wg, u16 port) { struct wg_peer *peer; if (wg->incoming_port == port) return 0; list_for_each_entry(peer, &wg->peer_list, peer_list) wg_socket_clear_peer_endpoint_src(peer); if (!netif_running(wg->dev)) { wg->incoming_port = port; return 0; } return wg_socket_init(wg, port); } static int set_allowedip(struct wg_peer *peer, struct nlattr **attrs) { int ret = -EINVAL; u16 family; u8 cidr; if (!attrs[WGALLOWEDIP_A_FAMILY] || !attrs[WGALLOWEDIP_A_IPADDR] || !attrs[WGALLOWEDIP_A_CIDR_MASK]) return ret; family = nla_get_u16(attrs[WGALLOWEDIP_A_FAMILY]); cidr = nla_get_u8(attrs[WGALLOWEDIP_A_CIDR_MASK]); if (family == AF_INET && cidr <= 32 && nla_len(attrs[WGALLOWEDIP_A_IPADDR]) == sizeof(struct in_addr)) ret = wg_allowedips_insert_v4( &peer->device->peer_allowedips, nla_data(attrs[WGALLOWEDIP_A_IPADDR]), cidr, peer, &peer->device->device_update_lock); else if (family == AF_INET6 && cidr <= 128 && nla_len(attrs[WGALLOWEDIP_A_IPADDR]) == sizeof(struct in6_addr)) ret = wg_allowedips_insert_v6( &peer->device->peer_allowedips, nla_data(attrs[WGALLOWEDIP_A_IPADDR]), cidr, peer, &peer->device->device_update_lock); return ret; } static int set_peer(struct wg_device *wg, struct nlattr **attrs) { u8 *public_key = NULL, *preshared_key = NULL; struct wg_peer *peer = NULL; u32 flags = 0; int ret; ret = -EINVAL; if (attrs[WGPEER_A_PUBLIC_KEY] && nla_len(attrs[WGPEER_A_PUBLIC_KEY]) == NOISE_PUBLIC_KEY_LEN) public_key = nla_data(attrs[WGPEER_A_PUBLIC_KEY]); else goto out; if (attrs[WGPEER_A_PRESHARED_KEY] && nla_len(attrs[WGPEER_A_PRESHARED_KEY]) == NOISE_SYMMETRIC_KEY_LEN) preshared_key = nla_data(attrs[WGPEER_A_PRESHARED_KEY]); if (attrs[WGPEER_A_FLAGS]) flags = nla_get_u32(attrs[WGPEER_A_FLAGS]); ret = -EOPNOTSUPP; if (flags & ~__WGPEER_F_ALL) goto out; ret = -EPFNOSUPPORT; if (attrs[WGPEER_A_PROTOCOL_VERSION]) { if (nla_get_u32(attrs[WGPEER_A_PROTOCOL_VERSION]) != 1) goto out; } peer = wg_pubkey_hashtable_lookup(wg->peer_hashtable, nla_data(attrs[WGPEER_A_PUBLIC_KEY])); ret = 0; if (!peer) { /* Peer doesn't exist yet. Add a new one. */ if (flags & (WGPEER_F_REMOVE_ME | WGPEER_F_UPDATE_ONLY)) goto out; /* The peer is new, so there aren't allowed IPs to remove. */ flags &= ~WGPEER_F_REPLACE_ALLOWEDIPS; down_read(&wg->static_identity.lock); if (wg->static_identity.has_identity && !memcmp(nla_data(attrs[WGPEER_A_PUBLIC_KEY]), wg->static_identity.static_public, NOISE_PUBLIC_KEY_LEN)) { /* We silently ignore peers that have the same public * key as the device. The reason we do it silently is * that we'd like for people to be able to reuse the * same set of API calls across peers. */ up_read(&wg->static_identity.lock); ret = 0; goto out; } up_read(&wg->static_identity.lock); peer = wg_peer_create(wg, public_key, preshared_key); if (IS_ERR(peer)) { ret = PTR_ERR(peer); peer = NULL; goto out; } /* Take additional reference, as though we've just been * looked up. 
*/ wg_peer_get(peer); } if (flags & WGPEER_F_REMOVE_ME) { wg_peer_remove(peer); goto out; } if (preshared_key) { down_write(&peer->handshake.lock); memcpy(&peer->handshake.preshared_key, preshared_key, NOISE_SYMMETRIC_KEY_LEN); up_write(&peer->handshake.lock); } if (attrs[WGPEER_A_ENDPOINT]) { struct sockaddr *addr = nla_data(attrs[WGPEER_A_ENDPOINT]); size_t len = nla_len(attrs[WGPEER_A_ENDPOINT]); struct endpoint endpoint = { { { 0 } } }; if (len == sizeof(struct sockaddr_in) && addr->sa_family == AF_INET) { endpoint.addr4 = *(struct sockaddr_in *)addr; wg_socket_set_peer_endpoint(peer, &endpoint); } else if (len == sizeof(struct sockaddr_in6) && addr->sa_family == AF_INET6) { endpoint.addr6 = *(struct sockaddr_in6 *)addr; wg_socket_set_peer_endpoint(peer, &endpoint); } } if (flags & WGPEER_F_REPLACE_ALLOWEDIPS) wg_allowedips_remove_by_peer(&wg->peer_allowedips, peer, &wg->device_update_lock); if (attrs[WGPEER_A_ALLOWEDIPS]) { struct nlattr *attr, *allowedip[WGALLOWEDIP_A_MAX + 1]; int rem; nla_for_each_nested(attr, attrs[WGPEER_A_ALLOWEDIPS], rem) { ret = nla_parse_nested(allowedip, WGALLOWEDIP_A_MAX, attr, allowedip_policy, NULL); if (ret < 0) goto out; ret = set_allowedip(peer, allowedip); if (ret < 0) goto out; } } if (attrs[WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL]) { const u16 persistent_keepalive_interval = nla_get_u16( attrs[WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL]); const bool send_keepalive = !peer->persistent_keepalive_interval && persistent_keepalive_interval && netif_running(wg->dev); peer->persistent_keepalive_interval = persistent_keepalive_interval; if (send_keepalive) wg_packet_send_keepalive(peer); } if (netif_running(wg->dev)) wg_packet_send_staged_packets(peer); out: wg_peer_put(peer); if (attrs[WGPEER_A_PRESHARED_KEY]) memzero_explicit(nla_data(attrs[WGPEER_A_PRESHARED_KEY]), nla_len(attrs[WGPEER_A_PRESHARED_KEY])); return ret; } static int wg_set_device(struct sk_buff *skb, struct genl_info *info) { struct wg_device *wg = lookup_interface(info->attrs, skb); u32 flags = 0; int ret; if (IS_ERR(wg)) { ret = PTR_ERR(wg); goto out_nodev; } rtnl_lock(); mutex_lock(&wg->device_update_lock); if (info->attrs[WGDEVICE_A_FLAGS]) flags = nla_get_u32(info->attrs[WGDEVICE_A_FLAGS]); ret = -EOPNOTSUPP; if (flags & ~__WGDEVICE_F_ALL) goto out; if (info->attrs[WGDEVICE_A_LISTEN_PORT] || info->attrs[WGDEVICE_A_FWMARK]) { struct net *net; rcu_read_lock(); net = rcu_dereference(wg->creating_net); ret = !net || !ns_capable(net->user_ns, CAP_NET_ADMIN) ? -EPERM : 0; rcu_read_unlock(); if (ret) goto out; } ++wg->device_update_gen; if (info->attrs[WGDEVICE_A_FWMARK]) { struct wg_peer *peer; wg->fwmark = nla_get_u32(info->attrs[WGDEVICE_A_FWMARK]); list_for_each_entry(peer, &wg->peer_list, peer_list) wg_socket_clear_peer_endpoint_src(peer); } if (info->attrs[WGDEVICE_A_LISTEN_PORT]) { ret = set_port(wg, nla_get_u16(info->attrs[WGDEVICE_A_LISTEN_PORT])); if (ret) goto out; } if (flags & WGDEVICE_F_REPLACE_PEERS) wg_peer_remove_all(wg); if (info->attrs[WGDEVICE_A_PRIVATE_KEY] && nla_len(info->attrs[WGDEVICE_A_PRIVATE_KEY]) == NOISE_PUBLIC_KEY_LEN) { u8 *private_key = nla_data(info->attrs[WGDEVICE_A_PRIVATE_KEY]); u8 public_key[NOISE_PUBLIC_KEY_LEN]; struct wg_peer *peer, *temp; bool send_staged_packets; if (!crypto_memneq(wg->static_identity.static_private, private_key, NOISE_PUBLIC_KEY_LEN)) goto skip_set_private_key; /* We remove before setting, to prevent race, which means doing * two 25519-genpub ops. 
*/ if (curve25519_generate_public(public_key, private_key)) { peer = wg_pubkey_hashtable_lookup(wg->peer_hashtable, public_key); if (peer) { wg_peer_put(peer); wg_peer_remove(peer); } } down_write(&wg->static_identity.lock); send_staged_packets = !wg->static_identity.has_identity && netif_running(wg->dev); wg_noise_set_static_identity_private_key(&wg->static_identity, private_key); send_staged_packets = send_staged_packets && wg->static_identity.has_identity; wg_cookie_checker_precompute_device_keys(&wg->cookie_checker); list_for_each_entry_safe(peer, temp, &wg->peer_list, peer_list) { wg_noise_precompute_static_static(peer); wg_noise_expire_current_peer_keypairs(peer); if (send_staged_packets) wg_packet_send_staged_packets(peer); } up_write(&wg->static_identity.lock); } skip_set_private_key: if (info->attrs[WGDEVICE_A_PEERS]) { struct nlattr *attr, *peer[WGPEER_A_MAX + 1]; int rem; nla_for_each_nested(attr, info->attrs[WGDEVICE_A_PEERS], rem) { ret = nla_parse_nested(peer, WGPEER_A_MAX, attr, peer_policy, NULL); if (ret < 0) goto out; ret = set_peer(wg, peer); if (ret < 0) goto out; } } ret = 0; out: mutex_unlock(&wg->device_update_lock); rtnl_unlock(); dev_put(wg->dev); out_nodev: if (info->attrs[WGDEVICE_A_PRIVATE_KEY]) memzero_explicit(nla_data(info->attrs[WGDEVICE_A_PRIVATE_KEY]), nla_len(info->attrs[WGDEVICE_A_PRIVATE_KEY])); return ret; } static const struct genl_ops genl_ops[] = { { .cmd = WG_CMD_GET_DEVICE, .start = wg_get_device_start, .dumpit = wg_get_device_dump, .done = wg_get_device_done, .flags = GENL_UNS_ADMIN_PERM }, { .cmd = WG_CMD_SET_DEVICE, .doit = wg_set_device, .flags = GENL_UNS_ADMIN_PERM } }; static struct genl_family genl_family __ro_after_init = { .ops = genl_ops, .n_ops = ARRAY_SIZE(genl_ops), .resv_start_op = WG_CMD_SET_DEVICE + 1, .name = WG_GENL_NAME, .version = WG_GENL_VERSION, .maxattr = WGDEVICE_A_MAX, .module = THIS_MODULE, .policy = device_policy, .netnsok = true }; int __init wg_genetlink_init(void) { return genl_register_family(&genl_family); } void __exit wg_genetlink_uninit(void) { genl_unregister_family(&genl_family); }
// SPDX-License-Identifier: GPL-2.0-only /* * * Author Karsten Keil <kkeil@novell.com> * * Copyright 2008 by Karsten Keil <kkeil@novell.com> */ #include <linux/mISDNif.h> #include <linux/slab.h> #include <linux/export.h> #include "core.h" static u_int *debug; static
struct proto mISDN_proto = { .name = "misdn", .owner = THIS_MODULE, .obj_size = sizeof(struct mISDN_sock) }; #define _pms(sk) ((struct mISDN_sock *)sk) static struct mISDN_sock_list data_sockets = { .lock = __RW_LOCK_UNLOCKED(data_sockets.lock) }; static struct mISDN_sock_list base_sockets = { .lock = __RW_LOCK_UNLOCKED(base_sockets.lock) }; #define L2_HEADER_LEN 4 static inline struct sk_buff * _l2_alloc_skb(unsigned int len, gfp_t gfp_mask) { struct sk_buff *skb; skb = alloc_skb(len + L2_HEADER_LEN, gfp_mask); if (likely(skb)) skb_reserve(skb, L2_HEADER_LEN); return skb; } static void mISDN_sock_link(struct mISDN_sock_list *l, struct sock *sk) { write_lock_bh(&l->lock); sk_add_node(sk, &l->head); write_unlock_bh(&l->lock); } static void mISDN_sock_unlink(struct mISDN_sock_list *l, struct sock *sk) { write_lock_bh(&l->lock); sk_del_node_init(sk); write_unlock_bh(&l->lock); } static int mISDN_send(struct mISDNchannel *ch, struct sk_buff *skb) { struct mISDN_sock *msk; int err; msk = container_of(ch, struct mISDN_sock, ch); if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s len %d %p\n", __func__, skb->len, skb); if (msk->sk.sk_state == MISDN_CLOSED) return -EUNATCH; __net_timestamp(skb); err = sock_queue_rcv_skb(&msk->sk, skb); if (err) printk(KERN_WARNING "%s: error %d\n", __func__, err); return err; } static int mISDN_ctrl(struct mISDNchannel *ch, u_int cmd, void *arg) { struct mISDN_sock *msk; msk = container_of(ch, struct mISDN_sock, ch); if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s(%p, %x, %p)\n", __func__, ch, cmd, arg); switch (cmd) { case CLOSE_CHANNEL: msk->sk.sk_state = MISDN_CLOSED; break; } return 0; } static inline void mISDN_sock_cmsg(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { struct __kernel_old_timeval tv; if (_pms(sk)->cmask & MISDN_TIME_STAMP) { skb_get_timestamp(skb, &tv); put_cmsg(msg, SOL_MISDN, MISDN_TIME_STAMP, sizeof(tv), &tv); } } static int mISDN_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sk_buff *skb; struct sock *sk = sock->sk; int copied, err; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s: len %d, flags %x ch.nr %d, proto %x\n", __func__, (int)len, flags, _pms(sk)->ch.nr, sk->sk_protocol); if (flags & (MSG_OOB)) return -EOPNOTSUPP; if (sk->sk_state == MISDN_CLOSED) return 0; skb = skb_recv_datagram(sk, flags, &err); if (!skb) return err; if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_mISDN *, maddr, msg->msg_name); maddr->family = AF_ISDN; maddr->dev = _pms(sk)->dev->id; if ((sk->sk_protocol == ISDN_P_LAPD_TE) || (sk->sk_protocol == ISDN_P_LAPD_NT)) { maddr->channel = (mISDN_HEAD_ID(skb) >> 16) & 0xff; maddr->tei = (mISDN_HEAD_ID(skb) >> 8) & 0xff; maddr->sapi = mISDN_HEAD_ID(skb) & 0xff; } else { maddr->channel = _pms(sk)->ch.nr; maddr->sapi = _pms(sk)->ch.addr & 0xFF; maddr->tei = (_pms(sk)->ch.addr >> 8) & 0xFF; } msg->msg_namelen = sizeof(*maddr); } copied = skb->len + MISDN_HEADER_LEN; if (len < copied) { if (flags & MSG_PEEK) refcount_dec(&skb->users); else skb_queue_head(&sk->sk_receive_queue, skb); return -ENOSPC; } memcpy(skb_push(skb, MISDN_HEADER_LEN), mISDN_HEAD_P(skb), MISDN_HEADER_LEN); err = skb_copy_datagram_msg(skb, 0, msg, copied); mISDN_sock_cmsg(sk, msg, skb); skb_free_datagram(sk, skb); return err ? 
: copied; } static int mISDN_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct sk_buff *skb; int err = -ENOMEM; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s: len %d flags %x ch %d proto %x\n", __func__, (int)len, msg->msg_flags, _pms(sk)->ch.nr, sk->sk_protocol); if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_NOSIGNAL | MSG_ERRQUEUE)) return -EINVAL; if (len < MISDN_HEADER_LEN) return -EINVAL; if (sk->sk_state != MISDN_BOUND) return -EBADFD; lock_sock(sk); skb = _l2_alloc_skb(len, GFP_KERNEL); if (!skb) goto done; if (memcpy_from_msg(skb_put(skb, len), msg, len)) { err = -EFAULT; goto done; } memcpy(mISDN_HEAD_P(skb), skb->data, MISDN_HEADER_LEN); skb_pull(skb, MISDN_HEADER_LEN); if (msg->msg_namelen >= sizeof(struct sockaddr_mISDN)) { /* if we have a address, we use it */ DECLARE_SOCKADDR(struct sockaddr_mISDN *, maddr, msg->msg_name); mISDN_HEAD_ID(skb) = maddr->channel; } else { /* use default for L2 messages */ if ((sk->sk_protocol == ISDN_P_LAPD_TE) || (sk->sk_protocol == ISDN_P_LAPD_NT)) mISDN_HEAD_ID(skb) = _pms(sk)->ch.nr; } if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s: ID:%x\n", __func__, mISDN_HEAD_ID(skb)); err = -ENODEV; if (!_pms(sk)->ch.peer) goto done; err = _pms(sk)->ch.recv(_pms(sk)->ch.peer, skb); if (err) goto done; else { skb = NULL; err = len; } done: kfree_skb(skb); release_sock(sk); return err; } static int data_sock_release(struct socket *sock) { struct sock *sk = sock->sk; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s(%p) sk=%p\n", __func__, sock, sk); if (!sk) return 0; switch (sk->sk_protocol) { case ISDN_P_TE_S0: case ISDN_P_NT_S0: case ISDN_P_TE_E1: case ISDN_P_NT_E1: if (sk->sk_state == MISDN_BOUND) delete_channel(&_pms(sk)->ch); else mISDN_sock_unlink(&data_sockets, sk); break; case ISDN_P_LAPD_TE: case ISDN_P_LAPD_NT: case ISDN_P_B_RAW: case ISDN_P_B_HDLC: case ISDN_P_B_X75SLP: case ISDN_P_B_L2DTMF: case ISDN_P_B_L2DSP: case ISDN_P_B_L2DSPHDLC: delete_channel(&_pms(sk)->ch); mISDN_sock_unlink(&data_sockets, sk); break; } lock_sock(sk); sock_orphan(sk); skb_queue_purge(&sk->sk_receive_queue); release_sock(sk); sock_put(sk); return 0; } static int data_sock_ioctl_bound(struct sock *sk, unsigned int cmd, void __user *p) { struct mISDN_ctrl_req cq; int err = -EINVAL, val[2]; struct mISDNchannel *bchan, *next; lock_sock(sk); if (!_pms(sk)->dev) { err = -ENODEV; goto done; } switch (cmd) { case IMCTRLREQ: if (copy_from_user(&cq, p, sizeof(cq))) { err = -EFAULT; break; } if ((sk->sk_protocol & ~ISDN_P_B_MASK) == ISDN_P_B_START) { list_for_each_entry_safe(bchan, next, &_pms(sk)->dev->bchannels, list) { if (bchan->nr == cq.channel) { err = bchan->ctrl(bchan, CONTROL_CHANNEL, &cq); break; } } } else err = _pms(sk)->dev->D.ctrl(&_pms(sk)->dev->D, CONTROL_CHANNEL, &cq); if (err) break; if (copy_to_user(p, &cq, sizeof(cq))) err = -EFAULT; break; case IMCLEAR_L2: if (sk->sk_protocol != ISDN_P_LAPD_NT) { err = -EINVAL; break; } val[0] = cmd; if (get_user(val[1], (int __user *)p)) { err = -EFAULT; break; } err = _pms(sk)->dev->teimgr->ctrl(_pms(sk)->dev->teimgr, CONTROL_CHANNEL, val); break; case IMHOLD_L1: if (sk->sk_protocol != ISDN_P_LAPD_NT && sk->sk_protocol != ISDN_P_LAPD_TE) { err = -EINVAL; break; } val[0] = cmd; if (get_user(val[1], (int __user *)p)) { err = -EFAULT; break; } err = _pms(sk)->dev->teimgr->ctrl(_pms(sk)->dev->teimgr, CONTROL_CHANNEL, val); break; default: err = -EINVAL; break; } done: release_sock(sk); return err; } static int 
data_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { int err = 0, id; struct sock *sk = sock->sk; struct mISDNdevice *dev; struct mISDNversion ver; switch (cmd) { case IMGETVERSION: ver.major = MISDN_MAJOR_VERSION; ver.minor = MISDN_MINOR_VERSION; ver.release = MISDN_RELEASE; if (copy_to_user((void __user *)arg, &ver, sizeof(ver))) err = -EFAULT; break; case IMGETCOUNT: id = get_mdevice_count(); if (put_user(id, (int __user *)arg)) err = -EFAULT; break; case IMGETDEVINFO: if (get_user(id, (int __user *)arg)) { err = -EFAULT; break; } dev = get_mdevice(id); if (dev) { struct mISDN_devinfo di; memset(&di, 0, sizeof(di)); di.id = dev->id; di.Dprotocols = dev->Dprotocols; di.Bprotocols = dev->Bprotocols | get_all_Bprotocols(); di.protocol = dev->D.protocol; memcpy(di.channelmap, dev->channelmap, sizeof(di.channelmap)); di.nrbchan = dev->nrbchan; strscpy(di.name, dev_name(&dev->dev), sizeof(di.name)); if (copy_to_user((void __user *)arg, &di, sizeof(di))) err = -EFAULT; } else err = -ENODEV; break; default: if (sk->sk_state == MISDN_BOUND) err = data_sock_ioctl_bound(sk, cmd, (void __user *)arg); else err = -ENOTCONN; } return err; } static int data_sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int err = 0, opt = 0; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s(%p, %d, %x, optval, %d)\n", __func__, sock, level, optname, optlen); lock_sock(sk); switch (optname) { case MISDN_TIME_STAMP: err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt) _pms(sk)->cmask |= MISDN_TIME_STAMP; else _pms(sk)->cmask &= ~MISDN_TIME_STAMP; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int data_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int len, opt; if (get_user(len, optlen)) return -EFAULT; if (len != sizeof(char)) return -EINVAL; switch (optname) { case MISDN_TIME_STAMP: if (_pms(sk)->cmask & MISDN_TIME_STAMP) opt = 1; else opt = 0; if (put_user(opt, optval)) return -EFAULT; break; default: return -ENOPROTOOPT; } return 0; } static int data_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_mISDN *maddr = (struct sockaddr_mISDN *) addr; struct sock *sk = sock->sk; struct sock *csk; int err = 0; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s(%p) sk=%p\n", __func__, sock, sk); if (addr_len != sizeof(struct sockaddr_mISDN)) return -EINVAL; if (!maddr || maddr->family != AF_ISDN) return -EINVAL; lock_sock(sk); if (_pms(sk)->dev) { err = -EALREADY; goto done; } _pms(sk)->dev = get_mdevice(maddr->dev); if (!_pms(sk)->dev) { err = -ENODEV; goto done; } if (sk->sk_protocol < ISDN_P_B_START) { read_lock_bh(&data_sockets.lock); sk_for_each(csk, &data_sockets.head) { if (sk == csk) continue; if (_pms(csk)->dev != _pms(sk)->dev) continue; if (csk->sk_protocol >= ISDN_P_B_START) continue; if (IS_ISDN_P_TE(csk->sk_protocol) == IS_ISDN_P_TE(sk->sk_protocol)) continue; read_unlock_bh(&data_sockets.lock); err = -EBUSY; goto done; } read_unlock_bh(&data_sockets.lock); } _pms(sk)->ch.send = mISDN_send; _pms(sk)->ch.ctrl = mISDN_ctrl; switch (sk->sk_protocol) { case ISDN_P_TE_S0: case ISDN_P_NT_S0: case ISDN_P_TE_E1: case ISDN_P_NT_E1: mISDN_sock_unlink(&data_sockets, sk); err = connect_layer1(_pms(sk)->dev, &_pms(sk)->ch, sk->sk_protocol, maddr); if (err) mISDN_sock_link(&data_sockets, sk); break; case ISDN_P_LAPD_TE: case 
ISDN_P_LAPD_NT: err = create_l2entity(_pms(sk)->dev, &_pms(sk)->ch, sk->sk_protocol, maddr); break; case ISDN_P_B_RAW: case ISDN_P_B_HDLC: case ISDN_P_B_X75SLP: case ISDN_P_B_L2DTMF: case ISDN_P_B_L2DSP: case ISDN_P_B_L2DSPHDLC: err = connect_Bstack(_pms(sk)->dev, &_pms(sk)->ch, sk->sk_protocol, maddr); break; default: err = -EPROTONOSUPPORT; } if (err) goto done; sk->sk_state = MISDN_BOUND; _pms(sk)->ch.protocol = sk->sk_protocol; done: release_sock(sk); return err; } static int data_sock_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct sockaddr_mISDN *maddr = (struct sockaddr_mISDN *) addr; struct sock *sk = sock->sk; if (!_pms(sk)->dev) return -EBADFD; lock_sock(sk); maddr->family = AF_ISDN; maddr->dev = _pms(sk)->dev->id; maddr->channel = _pms(sk)->ch.nr; maddr->sapi = _pms(sk)->ch.addr & 0xff; maddr->tei = (_pms(sk)->ch.addr >> 8) & 0xff; release_sock(sk); return sizeof(*maddr); } static const struct proto_ops data_sock_ops = { .family = PF_ISDN, .owner = THIS_MODULE, .release = data_sock_release, .ioctl = data_sock_ioctl, .bind = data_sock_bind, .getname = data_sock_getname, .sendmsg = mISDN_sock_sendmsg, .recvmsg = mISDN_sock_recvmsg, .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = data_sock_setsockopt, .getsockopt = data_sock_getsockopt, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .mmap = sock_no_mmap }; static int data_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; if (sock->type != SOCK_DGRAM) return -ESOCKTNOSUPPORT; sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); sock->ops = &data_sock_ops; sock->state = SS_UNCONNECTED; sock_reset_flag(sk, SOCK_ZAPPED); sk->sk_protocol = protocol; sk->sk_state = MISDN_OPEN; mISDN_sock_link(&data_sockets, sk); return 0; } static int base_sock_release(struct socket *sock) { struct sock *sk = sock->sk; printk(KERN_DEBUG "%s(%p) sk=%p\n", __func__, sock, sk); if (!sk) return 0; mISDN_sock_unlink(&base_sockets, sk); sock_orphan(sk); sock_put(sk); return 0; } static int base_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { int err = 0, id; struct mISDNdevice *dev; struct mISDNversion ver; switch (cmd) { case IMGETVERSION: ver.major = MISDN_MAJOR_VERSION; ver.minor = MISDN_MINOR_VERSION; ver.release = MISDN_RELEASE; if (copy_to_user((void __user *)arg, &ver, sizeof(ver))) err = -EFAULT; break; case IMGETCOUNT: id = get_mdevice_count(); if (put_user(id, (int __user *)arg)) err = -EFAULT; break; case IMGETDEVINFO: if (get_user(id, (int __user *)arg)) { err = -EFAULT; break; } dev = get_mdevice(id); if (dev) { struct mISDN_devinfo di; memset(&di, 0, sizeof(di)); di.id = dev->id; di.Dprotocols = dev->Dprotocols; di.Bprotocols = dev->Bprotocols | get_all_Bprotocols(); di.protocol = dev->D.protocol; memcpy(di.channelmap, dev->channelmap, sizeof(di.channelmap)); di.nrbchan = dev->nrbchan; strscpy(di.name, dev_name(&dev->dev), sizeof(di.name)); if (copy_to_user((void __user *)arg, &di, sizeof(di))) err = -EFAULT; } else err = -ENODEV; break; case IMSETDEVNAME: { struct mISDN_devrename dn; if (copy_from_user(&dn, (void __user *)arg, sizeof(dn))) { err = -EFAULT; break; } dn.name[sizeof(dn.name) - 1] = '\0'; dev = get_mdevice(dn.id); if (dev) err = device_rename(&dev->dev, dn.name); else err = -ENODEV; } break; default: err = -EINVAL; } return err; } static int base_sock_bind(struct socket *sock, struct sockaddr *addr, int 
addr_len) { struct sockaddr_mISDN *maddr = (struct sockaddr_mISDN *) addr; struct sock *sk = sock->sk; int err = 0; if (addr_len < sizeof(struct sockaddr_mISDN)) return -EINVAL; if (!maddr || maddr->family != AF_ISDN) return -EINVAL; lock_sock(sk); if (_pms(sk)->dev) { err = -EALREADY; goto done; } _pms(sk)->dev = get_mdevice(maddr->dev); if (!_pms(sk)->dev) { err = -ENODEV; goto done; } sk->sk_state = MISDN_BOUND; done: release_sock(sk); return err; } static const struct proto_ops base_sock_ops = { .family = PF_ISDN, .owner = THIS_MODULE, .release = base_sock_release, .ioctl = base_sock_ioctl, .bind = base_sock_bind, .getname = sock_no_getname, .sendmsg = sock_no_sendmsg, .recvmsg = sock_no_recvmsg, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .mmap = sock_no_mmap }; static int base_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; if (!capable(CAP_NET_RAW)) return -EPERM; sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); sock->ops = &base_sock_ops; sock->state = SS_UNCONNECTED; sock_reset_flag(sk, SOCK_ZAPPED); sk->sk_protocol = protocol; sk->sk_state = MISDN_OPEN; mISDN_sock_link(&base_sockets, sk); return 0; } static int mISDN_sock_create(struct net *net, struct socket *sock, int proto, int kern) { int err = -EPROTONOSUPPORT; switch (proto) { case ISDN_P_BASE: err = base_sock_create(net, sock, proto, kern); break; case ISDN_P_TE_S0: case ISDN_P_NT_S0: case ISDN_P_TE_E1: case ISDN_P_NT_E1: case ISDN_P_LAPD_TE: case ISDN_P_LAPD_NT: case ISDN_P_B_RAW: case ISDN_P_B_HDLC: case ISDN_P_B_X75SLP: case ISDN_P_B_L2DTMF: case ISDN_P_B_L2DSP: case ISDN_P_B_L2DSPHDLC: err = data_sock_create(net, sock, proto, kern); break; default: return err; } return err; } static const struct net_proto_family mISDN_sock_family_ops = { .owner = THIS_MODULE, .family = PF_ISDN, .create = mISDN_sock_create, }; int misdn_sock_init(u_int *deb) { int err; debug = deb; err = sock_register(&mISDN_sock_family_ops); if (err) printk(KERN_ERR "%s: error(%d)\n", __func__, err); return err; } void misdn_sock_cleanup(void) { sock_unregister(PF_ISDN); }
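The two socket flavours above are exercised from userspace roughly as sketched below. This is an illustrative sketch only, not part of the kernel sources: it assumes the exported <linux/mISDNif.h> definitions (ISDN_P_BASE, IMGETVERSION, IMGETCOUNT, struct mISDNversion, struct sockaddr_mISDN) are installed, and error handling is abbreviated. The base socket path needs CAP_NET_RAW, matching the capable() check in base_sock_create().

/* Illustrative userspace sketch (assumption: <linux/mISDNif.h> is available). */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <linux/mISDNif.h>

#ifndef AF_ISDN
#define AF_ISDN 34			/* assumption: value from linux/socket.h */
#endif

int main(void)
{
	struct mISDNversion ver;
	struct sockaddr_mISDN addr = { .family = AF_ISDN };
	int base, dsk, count = 0;

	/* Management socket: SOCK_RAW + ISDN_P_BASE (see base_sock_create()). */
	base = socket(AF_ISDN, SOCK_RAW, ISDN_P_BASE);
	if (base < 0)
		return 1;
	if (!ioctl(base, IMGETVERSION, &ver))
		printf("mISDN %d.%d.%d\n", ver.major, ver.minor, ver.release);
	if (!ioctl(base, IMGETCOUNT, &count))
		printf("%d device(s)\n", count);

	/* D-channel data socket: SOCK_DGRAM + a layer protocol, bound via
	 * struct sockaddr_mISDN as handled by data_sock_bind(). */
	dsk = socket(AF_ISDN, SOCK_DGRAM, ISDN_P_LAPD_TE);
	if (dsk >= 0) {
		addr.dev = 0;		/* hypothetical: first registered device */
		addr.channel = 0;
		addr.sapi = 0;
		addr.tei = 127;		/* group TEI */
		if (bind(dsk, (struct sockaddr *)&addr, sizeof(addr)))
			perror("bind");
		close(dsk);
	}
	close(base);
	return 0;
}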
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FIB_LOOKUP_H #define _FIB_LOOKUP_H #include <linux/types.h> #include <linux/list.h> #include <net/inet_dscp.h> #include <net/ip_fib.h> #include <net/nexthop.h> struct fib_alias { struct hlist_node fa_list; struct fib_info *fa_info; dscp_t fa_dscp; u8 fa_type; u8 fa_state; u8 fa_slen; u32 tb_id; s16 fa_default; u8 offload; u8 trap; u8 offload_failed; struct rcu_head rcu; }; #define FA_S_ACCESSED 0x01 /* Don't write on fa_state unless needed, to keep it shared on all cpus */ static inline void fib_alias_accessed(struct fib_alias *fa) { if (!(fa->fa_state & FA_S_ACCESSED)) fa->fa_state |= FA_S_ACCESSED; } /* Exported by fib_semantics.c */ void fib_release_info(struct fib_info *); struct fib_info *fib_create_info(struct fib_config *cfg, struct netlink_ext_ack *extack); int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi, struct netlink_ext_ack *extack); bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi); int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, const struct fib_rt_info *fri, unsigned int flags); void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, u32 tb_id, const struct nl_info *info, unsigned int nlm_flags); size_t fib_nlmsg_size(struct fib_info *fi); static inline void fib_result_assign(struct fib_result *res, struct fib_info *fi) { /* we used to play games with refcounts, but we now use RCU */ res->fi = fi; res->nhc = fib_info_nhc(fi, 0); } struct fib_prop { int error; u8 scope; }; extern const struct fib_prop fib_props[RTN_MAX + 1]; #endif /* _FIB_LOOKUP_H */
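The comment on fib_alias_accessed() above is worth calling out: state that is read on every lookup is only written when the flag actually changes, so the cacheline holding it normally stays shared between CPUs instead of bouncing. A stand-alone illustration of the same check-before-write idiom (purely illustrative, not kernel code):

#include <stdint.h>
#include <stdio.h>

#define FLAG_ACCESSED 0x01

struct entry {
	uint8_t state;		/* read-mostly, shared by many readers */
};

/* Mirror of the fib_alias_accessed() idiom: test first, write only on the
 * transition, so the common "already set" case never dirties the line. */
static void entry_accessed(struct entry *e)
{
	if (!(e->state & FLAG_ACCESSED))
		e->state |= FLAG_ACCESSED;
}

int main(void)
{
	struct entry e = { 0 };

	entry_accessed(&e);	/* first access: writes the flag */
	entry_accessed(&e);	/* later accesses: read-only */
	printf("state=%#x\n", e.state);
	return 0;
}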
// SPDX-License-Identifier: GPL-2.0-only /* * Monitoring code for network dropped packet alerts * * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/string.h> #include <linux/if_arp.h> #include <linux/inetdevice.h> #include <linux/inet.h> #include <linux/interrupt.h> #include <linux/netpoll.h> #include <linux/sched.h> #include <linux/delay.h> #include <linux/types.h> #include <linux/workqueue.h> #include <linux/netlink.h> #include <linux/net_dropmon.h> #include <linux/bitfield.h> #include <linux/percpu.h> #include <linux/timer.h> #include <linux/bitops.h> #include <linux/slab.h> #include <linux/module.h> #include <net/genetlink.h> #include <net/netevent.h> #include <net/flow_offload.h> #include <net/dropreason.h> #include <net/devlink.h> #include <trace/events/skb.h> #include <trace/events/napi.h> #include <trace/events/devlink.h> #include <linux/unaligned.h> #define TRACE_ON 1 #define TRACE_OFF 0 /* * Globals, our netlink socket pointer * and the work handle that will send up * netlink alerts */ static int trace_state = TRACE_OFF; static bool monitor_hw; /* net_dm_mutex * * An overall lock guarding every operation coming from userspace. 
*/ static DEFINE_MUTEX(net_dm_mutex); struct net_dm_stats { u64_stats_t dropped; struct u64_stats_sync syncp; }; #define NET_DM_MAX_HW_TRAP_NAME_LEN 40 struct net_dm_hw_entry { char trap_name[NET_DM_MAX_HW_TRAP_NAME_LEN]; u32 count; }; struct net_dm_hw_entries { u32 num_entries; struct net_dm_hw_entry entries[]; }; struct per_cpu_dm_data { raw_spinlock_t lock; /* Protects 'skb', 'hw_entries' and * 'send_timer' */ union { struct sk_buff *skb; struct net_dm_hw_entries *hw_entries; }; struct sk_buff_head drop_queue; struct work_struct dm_alert_work; struct timer_list send_timer; struct net_dm_stats stats; }; struct dm_hw_stat_delta { unsigned long last_rx; unsigned long last_drop_val; struct rcu_head rcu; }; static struct genl_family net_drop_monitor_family; static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_hw_cpu_data); static int dm_hit_limit = 64; static int dm_delay = 1; static unsigned long dm_hw_check_delta = 2*HZ; static enum net_dm_alert_mode net_dm_alert_mode = NET_DM_ALERT_MODE_SUMMARY; static u32 net_dm_trunc_len; static u32 net_dm_queue_len = 1000; struct net_dm_alert_ops { void (*kfree_skb_probe)(void *ignore, struct sk_buff *skb, void *location, enum skb_drop_reason reason, struct sock *rx_sk); void (*napi_poll_probe)(void *ignore, struct napi_struct *napi, int work, int budget); void (*work_item_func)(struct work_struct *work); void (*hw_work_item_func)(struct work_struct *work); void (*hw_trap_probe)(void *ignore, const struct devlink *devlink, struct sk_buff *skb, const struct devlink_trap_metadata *metadata); }; struct net_dm_skb_cb { union { struct devlink_trap_metadata *hw_metadata; void *pc; }; enum skb_drop_reason reason; }; #define NET_DM_SKB_CB(__skb) ((struct net_dm_skb_cb *)&((__skb)->cb[0])) static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) { size_t al; struct net_dm_alert_msg *msg; struct nlattr *nla; struct sk_buff *skb; unsigned long flags; void *msg_header; al = sizeof(struct net_dm_alert_msg); al += dm_hit_limit * sizeof(struct net_dm_drop_point); al += sizeof(struct nlattr); skb = genlmsg_new(al, GFP_KERNEL); if (!skb) goto err; msg_header = genlmsg_put(skb, 0, 0, &net_drop_monitor_family, 0, NET_DM_CMD_ALERT); if (!msg_header) { nlmsg_free(skb); skb = NULL; goto err; } nla = nla_reserve(skb, NLA_UNSPEC, sizeof(struct net_dm_alert_msg)); if (!nla) { nlmsg_free(skb); skb = NULL; goto err; } msg = nla_data(nla); memset(msg, 0, al); goto out; err: mod_timer(&data->send_timer, jiffies + HZ / 10); out: raw_spin_lock_irqsave(&data->lock, flags); swap(data->skb, skb); raw_spin_unlock_irqrestore(&data->lock, flags); if (skb) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data; struct genlmsghdr *gnlh = (struct genlmsghdr *)nlmsg_data(nlh); genlmsg_end(skb, genlmsg_data(gnlh)); } return skb; } static const struct genl_multicast_group dropmon_mcgrps[] = { { .name = "events", .flags = GENL_MCAST_CAP_SYS_ADMIN, }, }; static void send_dm_alert(struct work_struct *work) { struct sk_buff *skb; struct per_cpu_dm_data *data; data = container_of(work, struct per_cpu_dm_data, dm_alert_work); skb = reset_per_cpu_data(data); if (skb) genlmsg_multicast(&net_drop_monitor_family, skb, 0, 0, GFP_KERNEL); } /* * This is the timer function to delay the sending of an alert * in the event that more drops will arrive during the * hysteresis period. 
*/ static void sched_send_work(struct timer_list *t) { struct per_cpu_dm_data *data = from_timer(data, t, send_timer); schedule_work(&data->dm_alert_work); } static void trace_drop_common(struct sk_buff *skb, void *location) { struct net_dm_alert_msg *msg; struct net_dm_drop_point *point; struct nlmsghdr *nlh; struct nlattr *nla; int i; struct sk_buff *dskb; struct per_cpu_dm_data *data; unsigned long flags; local_irq_save(flags); data = this_cpu_ptr(&dm_cpu_data); raw_spin_lock(&data->lock); dskb = data->skb; if (!dskb) goto out; nlh = (struct nlmsghdr *)dskb->data; nla = genlmsg_data(nlmsg_data(nlh)); msg = nla_data(nla); point = msg->points; for (i = 0; i < msg->entries; i++) { if (!memcmp(&location, &point->pc, sizeof(void *))) { point->count++; goto out; } point++; } if (msg->entries == dm_hit_limit) goto out; /* * We need to create a new entry */ __nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point)); nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point)); memcpy(point->pc, &location, sizeof(void *)); point->count = 1; msg->entries++; if (!timer_pending(&data->send_timer)) { data->send_timer.expires = jiffies + dm_delay * HZ; add_timer(&data->send_timer); } out: raw_spin_unlock_irqrestore(&data->lock, flags); } static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location, enum skb_drop_reason reason, struct sock *rx_sk) { trace_drop_common(skb, location); } static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi, int work, int budget) { struct net_device *dev = napi->dev; struct dm_hw_stat_delta *stat; /* * Don't check napi structures with no associated device */ if (!dev) return; rcu_read_lock(); stat = rcu_dereference(dev->dm_private); if (stat) { /* * only add a note to our monitor buffer if: * 1) its after the last_rx delta * 2) our rx_dropped count has gone up */ if (time_after(jiffies, stat->last_rx + dm_hw_check_delta) && (dev->stats.rx_dropped != stat->last_drop_val)) { trace_drop_common(NULL, NULL); stat->last_drop_val = dev->stats.rx_dropped; stat->last_rx = jiffies; } } rcu_read_unlock(); } static struct net_dm_hw_entries * net_dm_hw_reset_per_cpu_data(struct per_cpu_dm_data *hw_data) { struct net_dm_hw_entries *hw_entries; unsigned long flags; hw_entries = kzalloc(struct_size(hw_entries, entries, dm_hit_limit), GFP_KERNEL); if (!hw_entries) { /* If the memory allocation failed, we try to perform another * allocation in 1/10 second. Otherwise, the probe function * will constantly bail out. 
*/ mod_timer(&hw_data->send_timer, jiffies + HZ / 10); } raw_spin_lock_irqsave(&hw_data->lock, flags); swap(hw_data->hw_entries, hw_entries); raw_spin_unlock_irqrestore(&hw_data->lock, flags); return hw_entries; } static int net_dm_hw_entry_put(struct sk_buff *msg, const struct net_dm_hw_entry *hw_entry) { struct nlattr *attr; attr = nla_nest_start(msg, NET_DM_ATTR_HW_ENTRY); if (!attr) return -EMSGSIZE; if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_NAME, hw_entry->trap_name)) goto nla_put_failure; if (nla_put_u32(msg, NET_DM_ATTR_HW_TRAP_COUNT, hw_entry->count)) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static int net_dm_hw_entries_put(struct sk_buff *msg, const struct net_dm_hw_entries *hw_entries) { struct nlattr *attr; int i; attr = nla_nest_start(msg, NET_DM_ATTR_HW_ENTRIES); if (!attr) return -EMSGSIZE; for (i = 0; i < hw_entries->num_entries; i++) { int rc; rc = net_dm_hw_entry_put(msg, &hw_entries->entries[i]); if (rc) goto nla_put_failure; } nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static int net_dm_hw_summary_report_fill(struct sk_buff *msg, const struct net_dm_hw_entries *hw_entries) { struct net_dm_alert_msg anc_hdr = { 0 }; void *hdr; int rc; hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0, NET_DM_CMD_ALERT); if (!hdr) return -EMSGSIZE; /* We need to put the ancillary header in order not to break user * space. */ if (nla_put(msg, NLA_UNSPEC, sizeof(anc_hdr), &anc_hdr)) goto nla_put_failure; rc = net_dm_hw_entries_put(msg, hw_entries); if (rc) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static void net_dm_hw_summary_work(struct work_struct *work) { struct net_dm_hw_entries *hw_entries; struct per_cpu_dm_data *hw_data; struct sk_buff *msg; int rc; hw_data = container_of(work, struct per_cpu_dm_data, dm_alert_work); hw_entries = net_dm_hw_reset_per_cpu_data(hw_data); if (!hw_entries) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) goto out; rc = net_dm_hw_summary_report_fill(msg, hw_entries); if (rc) { nlmsg_free(msg); goto out; } genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL); out: kfree(hw_entries); } static void net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink, struct sk_buff *skb, const struct devlink_trap_metadata *metadata) { struct net_dm_hw_entries *hw_entries; struct net_dm_hw_entry *hw_entry; struct per_cpu_dm_data *hw_data; unsigned long flags; int i; if (metadata->trap_type == DEVLINK_TRAP_TYPE_CONTROL) return; hw_data = this_cpu_ptr(&dm_hw_cpu_data); raw_spin_lock_irqsave(&hw_data->lock, flags); hw_entries = hw_data->hw_entries; if (!hw_entries) goto out; for (i = 0; i < hw_entries->num_entries; i++) { hw_entry = &hw_entries->entries[i]; if (!strncmp(hw_entry->trap_name, metadata->trap_name, NET_DM_MAX_HW_TRAP_NAME_LEN - 1)) { hw_entry->count++; goto out; } } if (WARN_ON_ONCE(hw_entries->num_entries == dm_hit_limit)) goto out; hw_entry = &hw_entries->entries[hw_entries->num_entries]; strscpy(hw_entry->trap_name, metadata->trap_name, NET_DM_MAX_HW_TRAP_NAME_LEN - 1); hw_entry->count = 1; hw_entries->num_entries++; if (!timer_pending(&hw_data->send_timer)) { hw_data->send_timer.expires = jiffies + dm_delay * HZ; add_timer(&hw_data->send_timer); } out: raw_spin_unlock_irqrestore(&hw_data->lock, flags); } static const struct net_dm_alert_ops net_dm_alert_summary_ops = { .kfree_skb_probe = 
trace_kfree_skb_hit, .napi_poll_probe = trace_napi_poll_hit, .work_item_func = send_dm_alert, .hw_work_item_func = net_dm_hw_summary_work, .hw_trap_probe = net_dm_hw_trap_summary_probe, }; static void net_dm_packet_trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location, enum skb_drop_reason reason, struct sock *rx_sk) { ktime_t tstamp = ktime_get_real(); struct per_cpu_dm_data *data; struct net_dm_skb_cb *cb; struct sk_buff *nskb; unsigned long flags; if (!skb_mac_header_was_set(skb)) return; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return; cb = NET_DM_SKB_CB(nskb); cb->reason = reason; cb->pc = location; /* Override the timestamp because we care about the time when the * packet was dropped. */ nskb->tstamp = tstamp; data = this_cpu_ptr(&dm_cpu_data); spin_lock_irqsave(&data->drop_queue.lock, flags); if (skb_queue_len(&data->drop_queue) < net_dm_queue_len) __skb_queue_tail(&data->drop_queue, nskb); else goto unlock_free; spin_unlock_irqrestore(&data->drop_queue.lock, flags); schedule_work(&data->dm_alert_work); return; unlock_free: spin_unlock_irqrestore(&data->drop_queue.lock, flags); u64_stats_update_begin(&data->stats.syncp); u64_stats_inc(&data->stats.dropped); u64_stats_update_end(&data->stats.syncp); consume_skb(nskb); } static void net_dm_packet_trace_napi_poll_hit(void *ignore, struct napi_struct *napi, int work, int budget) { } static size_t net_dm_in_port_size(void) { /* NET_DM_ATTR_IN_PORT nest */ return nla_total_size(0) + /* NET_DM_ATTR_PORT_NETDEV_IFINDEX */ nla_total_size(sizeof(u32)) + /* NET_DM_ATTR_PORT_NETDEV_NAME */ nla_total_size(IFNAMSIZ + 1); } #define NET_DM_MAX_SYMBOL_LEN 40 #define NET_DM_MAX_REASON_LEN 50 static size_t net_dm_packet_report_size(size_t payload_len) { size_t size; size = nlmsg_msg_size(GENL_HDRLEN + net_drop_monitor_family.hdrsize); return NLMSG_ALIGN(size) + /* NET_DM_ATTR_ORIGIN */ nla_total_size(sizeof(u16)) + /* NET_DM_ATTR_PC */ nla_total_size(sizeof(u64)) + /* NET_DM_ATTR_SYMBOL */ nla_total_size(NET_DM_MAX_SYMBOL_LEN + 1) + /* NET_DM_ATTR_IN_PORT */ net_dm_in_port_size() + /* NET_DM_ATTR_TIMESTAMP */ nla_total_size(sizeof(u64)) + /* NET_DM_ATTR_ORIG_LEN */ nla_total_size(sizeof(u32)) + /* NET_DM_ATTR_PROTO */ nla_total_size(sizeof(u16)) + /* NET_DM_ATTR_REASON */ nla_total_size(NET_DM_MAX_REASON_LEN + 1) + /* NET_DM_ATTR_PAYLOAD */ nla_total_size(payload_len); } static int net_dm_packet_report_in_port_put(struct sk_buff *msg, int ifindex, const char *name) { struct nlattr *attr; attr = nla_nest_start(msg, NET_DM_ATTR_IN_PORT); if (!attr) return -EMSGSIZE; if (ifindex && nla_put_u32(msg, NET_DM_ATTR_PORT_NETDEV_IFINDEX, ifindex)) goto nla_put_failure; if (name && nla_put_string(msg, NET_DM_ATTR_PORT_NETDEV_NAME, name)) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb, size_t payload_len) { struct net_dm_skb_cb *cb = NET_DM_SKB_CB(skb); const struct drop_reason_list *list = NULL; unsigned int subsys, subsys_reason; char buf[NET_DM_MAX_SYMBOL_LEN]; struct nlattr *attr; void *hdr; int rc; hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0, NET_DM_CMD_PACKET_ALERT); if (!hdr) return -EMSGSIZE; if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_SW)) goto nla_put_failure; if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, (u64)(uintptr_t)cb->pc, NET_DM_ATTR_PAD)) goto nla_put_failure; rcu_read_lock(); subsys = u32_get_bits(cb->reason, SKB_DROP_REASON_SUBSYS_MASK); if (subsys < 
SKB_DROP_REASON_SUBSYS_NUM) list = rcu_dereference(drop_reasons_by_subsys[subsys]); subsys_reason = cb->reason & ~SKB_DROP_REASON_SUBSYS_MASK; if (!list || subsys_reason >= list->n_reasons || !list->reasons[subsys_reason] || strlen(list->reasons[subsys_reason]) > NET_DM_MAX_REASON_LEN) { list = rcu_dereference(drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_CORE]); subsys_reason = SKB_DROP_REASON_NOT_SPECIFIED; } if (nla_put_string(msg, NET_DM_ATTR_REASON, list->reasons[subsys_reason])) { rcu_read_unlock(); goto nla_put_failure; } rcu_read_unlock(); snprintf(buf, sizeof(buf), "%pS", cb->pc); if (nla_put_string(msg, NET_DM_ATTR_SYMBOL, buf)) goto nla_put_failure; rc = net_dm_packet_report_in_port_put(msg, skb->skb_iif, NULL); if (rc) goto nla_put_failure; if (nla_put_u64_64bit(msg, NET_DM_ATTR_TIMESTAMP, ktime_to_ns(skb->tstamp), NET_DM_ATTR_PAD)) goto nla_put_failure; if (nla_put_u32(msg, NET_DM_ATTR_ORIG_LEN, skb->len)) goto nla_put_failure; if (!payload_len) goto out; if (nla_put_u16(msg, NET_DM_ATTR_PROTO, be16_to_cpu(skb->protocol))) goto nla_put_failure; attr = skb_put(msg, nla_total_size(payload_len)); attr->nla_type = NET_DM_ATTR_PAYLOAD; attr->nla_len = nla_attr_size(payload_len); if (skb_copy_bits(skb, 0, nla_data(attr), payload_len)) goto nla_put_failure; out: genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } #define NET_DM_MAX_PACKET_SIZE (0xffff - NLA_HDRLEN - NLA_ALIGNTO) static void net_dm_packet_report(struct sk_buff *skb) { struct sk_buff *msg; size_t payload_len; int rc; /* Make sure we start copying the packet from the MAC header */ if (skb->data > skb_mac_header(skb)) skb_push(skb, skb->data - skb_mac_header(skb)); else skb_pull(skb, skb_mac_header(skb) - skb->data); /* Ensure packet fits inside a single netlink attribute */ payload_len = min_t(size_t, skb->len, NET_DM_MAX_PACKET_SIZE); if (net_dm_trunc_len) payload_len = min_t(size_t, net_dm_trunc_len, payload_len); msg = nlmsg_new(net_dm_packet_report_size(payload_len), GFP_KERNEL); if (!msg) goto out; rc = net_dm_packet_report_fill(msg, skb, payload_len); if (rc) { nlmsg_free(msg); goto out; } genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL); out: consume_skb(skb); } static void net_dm_packet_work(struct work_struct *work) { struct per_cpu_dm_data *data; struct sk_buff_head list; struct sk_buff *skb; unsigned long flags; data = container_of(work, struct per_cpu_dm_data, dm_alert_work); __skb_queue_head_init(&list); spin_lock_irqsave(&data->drop_queue.lock, flags); skb_queue_splice_tail_init(&data->drop_queue, &list); spin_unlock_irqrestore(&data->drop_queue.lock, flags); while ((skb = __skb_dequeue(&list))) net_dm_packet_report(skb); } static size_t net_dm_flow_action_cookie_size(const struct devlink_trap_metadata *hw_metadata) { return hw_metadata->fa_cookie ? 
nla_total_size(hw_metadata->fa_cookie->cookie_len) : 0; } static size_t net_dm_hw_packet_report_size(size_t payload_len, const struct devlink_trap_metadata *hw_metadata) { size_t size; size = nlmsg_msg_size(GENL_HDRLEN + net_drop_monitor_family.hdrsize); return NLMSG_ALIGN(size) + /* NET_DM_ATTR_ORIGIN */ nla_total_size(sizeof(u16)) + /* NET_DM_ATTR_HW_TRAP_GROUP_NAME */ nla_total_size(strlen(hw_metadata->trap_group_name) + 1) + /* NET_DM_ATTR_HW_TRAP_NAME */ nla_total_size(strlen(hw_metadata->trap_name) + 1) + /* NET_DM_ATTR_IN_PORT */ net_dm_in_port_size() + /* NET_DM_ATTR_FLOW_ACTION_COOKIE */ net_dm_flow_action_cookie_size(hw_metadata) + /* NET_DM_ATTR_TIMESTAMP */ nla_total_size(sizeof(u64)) + /* NET_DM_ATTR_ORIG_LEN */ nla_total_size(sizeof(u32)) + /* NET_DM_ATTR_PROTO */ nla_total_size(sizeof(u16)) + /* NET_DM_ATTR_PAYLOAD */ nla_total_size(payload_len); } static int net_dm_hw_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb, size_t payload_len) { struct devlink_trap_metadata *hw_metadata; struct nlattr *attr; void *hdr; hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0, NET_DM_CMD_PACKET_ALERT); if (!hdr) return -EMSGSIZE; if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_HW)) goto nla_put_failure; if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_GROUP_NAME, hw_metadata->trap_group_name)) goto nla_put_failure; if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_NAME, hw_metadata->trap_name)) goto nla_put_failure; if (hw_metadata->input_dev) { struct net_device *dev = hw_metadata->input_dev; int rc; rc = net_dm_packet_report_in_port_put(msg, dev->ifindex, dev->name); if (rc) goto nla_put_failure; } if (hw_metadata->fa_cookie && nla_put(msg, NET_DM_ATTR_FLOW_ACTION_COOKIE, hw_metadata->fa_cookie->cookie_len, hw_metadata->fa_cookie->cookie)) goto nla_put_failure; if (nla_put_u64_64bit(msg, NET_DM_ATTR_TIMESTAMP, ktime_to_ns(skb->tstamp), NET_DM_ATTR_PAD)) goto nla_put_failure; if (nla_put_u32(msg, NET_DM_ATTR_ORIG_LEN, skb->len)) goto nla_put_failure; if (!payload_len) goto out; if (nla_put_u16(msg, NET_DM_ATTR_PROTO, be16_to_cpu(skb->protocol))) goto nla_put_failure; attr = skb_put(msg, nla_total_size(payload_len)); attr->nla_type = NET_DM_ATTR_PAYLOAD; attr->nla_len = nla_attr_size(payload_len); if (skb_copy_bits(skb, 0, nla_data(attr), payload_len)) goto nla_put_failure; out: genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static struct devlink_trap_metadata * net_dm_hw_metadata_copy(const struct devlink_trap_metadata *metadata) { const struct flow_action_cookie *fa_cookie; struct devlink_trap_metadata *hw_metadata; const char *trap_group_name; const char *trap_name; hw_metadata = kzalloc(sizeof(*hw_metadata), GFP_ATOMIC); if (!hw_metadata) return NULL; trap_group_name = kstrdup(metadata->trap_group_name, GFP_ATOMIC); if (!trap_group_name) goto free_hw_metadata; hw_metadata->trap_group_name = trap_group_name; trap_name = kstrdup(metadata->trap_name, GFP_ATOMIC); if (!trap_name) goto free_trap_group; hw_metadata->trap_name = trap_name; if (metadata->fa_cookie) { size_t cookie_size = sizeof(*fa_cookie) + metadata->fa_cookie->cookie_len; fa_cookie = kmemdup(metadata->fa_cookie, cookie_size, GFP_ATOMIC); if (!fa_cookie) goto free_trap_name; hw_metadata->fa_cookie = fa_cookie; } hw_metadata->input_dev = metadata->input_dev; netdev_hold(hw_metadata->input_dev, &hw_metadata->dev_tracker, GFP_ATOMIC); return hw_metadata; free_trap_name: kfree(trap_name); free_trap_group: 
kfree(trap_group_name); free_hw_metadata: kfree(hw_metadata); return NULL; } static void net_dm_hw_metadata_free(struct devlink_trap_metadata *hw_metadata) { netdev_put(hw_metadata->input_dev, &hw_metadata->dev_tracker); kfree(hw_metadata->fa_cookie); kfree(hw_metadata->trap_name); kfree(hw_metadata->trap_group_name); kfree(hw_metadata); } static void net_dm_hw_packet_report(struct sk_buff *skb) { struct devlink_trap_metadata *hw_metadata; struct sk_buff *msg; size_t payload_len; int rc; if (skb->data > skb_mac_header(skb)) skb_push(skb, skb->data - skb_mac_header(skb)); else skb_pull(skb, skb_mac_header(skb) - skb->data); payload_len = min_t(size_t, skb->len, NET_DM_MAX_PACKET_SIZE); if (net_dm_trunc_len) payload_len = min_t(size_t, net_dm_trunc_len, payload_len); hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; msg = nlmsg_new(net_dm_hw_packet_report_size(payload_len, hw_metadata), GFP_KERNEL); if (!msg) goto out; rc = net_dm_hw_packet_report_fill(msg, skb, payload_len); if (rc) { nlmsg_free(msg); goto out; } genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL); out: net_dm_hw_metadata_free(NET_DM_SKB_CB(skb)->hw_metadata); consume_skb(skb); } static void net_dm_hw_packet_work(struct work_struct *work) { struct per_cpu_dm_data *hw_data; struct sk_buff_head list; struct sk_buff *skb; unsigned long flags; hw_data = container_of(work, struct per_cpu_dm_data, dm_alert_work); __skb_queue_head_init(&list); spin_lock_irqsave(&hw_data->drop_queue.lock, flags); skb_queue_splice_tail_init(&hw_data->drop_queue, &list); spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags); while ((skb = __skb_dequeue(&list))) net_dm_hw_packet_report(skb); } static void net_dm_hw_trap_packet_probe(void *ignore, const struct devlink *devlink, struct sk_buff *skb, const struct devlink_trap_metadata *metadata) { struct devlink_trap_metadata *n_hw_metadata; ktime_t tstamp = ktime_get_real(); struct per_cpu_dm_data *hw_data; struct sk_buff *nskb; unsigned long flags; if (metadata->trap_type == DEVLINK_TRAP_TYPE_CONTROL) return; if (!skb_mac_header_was_set(skb)) return; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return; n_hw_metadata = net_dm_hw_metadata_copy(metadata); if (!n_hw_metadata) goto free; NET_DM_SKB_CB(nskb)->hw_metadata = n_hw_metadata; nskb->tstamp = tstamp; hw_data = this_cpu_ptr(&dm_hw_cpu_data); spin_lock_irqsave(&hw_data->drop_queue.lock, flags); if (skb_queue_len(&hw_data->drop_queue) < net_dm_queue_len) __skb_queue_tail(&hw_data->drop_queue, nskb); else goto unlock_free; spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags); schedule_work(&hw_data->dm_alert_work); return; unlock_free: spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags); u64_stats_update_begin(&hw_data->stats.syncp); u64_stats_inc(&hw_data->stats.dropped); u64_stats_update_end(&hw_data->stats.syncp); net_dm_hw_metadata_free(n_hw_metadata); free: consume_skb(nskb); } static const struct net_dm_alert_ops net_dm_alert_packet_ops = { .kfree_skb_probe = net_dm_packet_trace_kfree_skb_hit, .napi_poll_probe = net_dm_packet_trace_napi_poll_hit, .work_item_func = net_dm_packet_work, .hw_work_item_func = net_dm_hw_packet_work, .hw_trap_probe = net_dm_hw_trap_packet_probe, }; static const struct net_dm_alert_ops *net_dm_alert_ops_arr[] = { [NET_DM_ALERT_MODE_SUMMARY] = &net_dm_alert_summary_ops, [NET_DM_ALERT_MODE_PACKET] = &net_dm_alert_packet_ops, }; #if IS_ENABLED(CONFIG_NET_DEVLINK) static int net_dm_hw_probe_register(const struct net_dm_alert_ops *ops) { return register_trace_devlink_trap_report(ops->hw_trap_probe, 
NULL); } static void net_dm_hw_probe_unregister(const struct net_dm_alert_ops *ops) { unregister_trace_devlink_trap_report(ops->hw_trap_probe, NULL); tracepoint_synchronize_unregister(); } #else static int net_dm_hw_probe_register(const struct net_dm_alert_ops *ops) { return -EOPNOTSUPP; } static void net_dm_hw_probe_unregister(const struct net_dm_alert_ops *ops) { } #endif static int net_dm_hw_monitor_start(struct netlink_ext_ack *extack) { const struct net_dm_alert_ops *ops; int cpu, rc; if (monitor_hw) { NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already enabled"); return -EAGAIN; } ops = net_dm_alert_ops_arr[net_dm_alert_mode]; if (!try_module_get(THIS_MODULE)) { NL_SET_ERR_MSG_MOD(extack, "Failed to take reference on module"); return -ENODEV; } for_each_possible_cpu(cpu) { struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); struct net_dm_hw_entries *hw_entries; INIT_WORK(&hw_data->dm_alert_work, ops->hw_work_item_func); timer_setup(&hw_data->send_timer, sched_send_work, 0); hw_entries = net_dm_hw_reset_per_cpu_data(hw_data); kfree(hw_entries); } rc = net_dm_hw_probe_register(ops); if (rc) { NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to devlink_trap_probe() tracepoint"); goto err_module_put; } monitor_hw = true; return 0; err_module_put: for_each_possible_cpu(cpu) { struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); struct sk_buff *skb; del_timer_sync(&hw_data->send_timer); cancel_work_sync(&hw_data->dm_alert_work); while ((skb = __skb_dequeue(&hw_data->drop_queue))) { struct devlink_trap_metadata *hw_metadata; hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; net_dm_hw_metadata_free(hw_metadata); consume_skb(skb); } } module_put(THIS_MODULE); return rc; } static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack) { const struct net_dm_alert_ops *ops; int cpu; if (!monitor_hw) { NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already disabled"); return; } ops = net_dm_alert_ops_arr[net_dm_alert_mode]; monitor_hw = false; net_dm_hw_probe_unregister(ops); for_each_possible_cpu(cpu) { struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); struct sk_buff *skb; del_timer_sync(&hw_data->send_timer); cancel_work_sync(&hw_data->dm_alert_work); while ((skb = __skb_dequeue(&hw_data->drop_queue))) { struct devlink_trap_metadata *hw_metadata; hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; net_dm_hw_metadata_free(hw_metadata); consume_skb(skb); } } module_put(THIS_MODULE); } static int net_dm_trace_on_set(struct netlink_ext_ack *extack) { const struct net_dm_alert_ops *ops; int cpu, rc; ops = net_dm_alert_ops_arr[net_dm_alert_mode]; if (!try_module_get(THIS_MODULE)) { NL_SET_ERR_MSG_MOD(extack, "Failed to take reference on module"); return -ENODEV; } for_each_possible_cpu(cpu) { struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); struct sk_buff *skb; INIT_WORK(&data->dm_alert_work, ops->work_item_func); timer_setup(&data->send_timer, sched_send_work, 0); /* Allocate a new per-CPU skb for the summary alert message and * free the old one which might contain stale data from * previous tracing. 
*/ skb = reset_per_cpu_data(data); consume_skb(skb); } rc = register_trace_kfree_skb(ops->kfree_skb_probe, NULL); if (rc) { NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to kfree_skb() tracepoint"); goto err_module_put; } rc = register_trace_napi_poll(ops->napi_poll_probe, NULL); if (rc) { NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to napi_poll() tracepoint"); goto err_unregister_trace; } return 0; err_unregister_trace: unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL); err_module_put: for_each_possible_cpu(cpu) { struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); struct sk_buff *skb; del_timer_sync(&data->send_timer); cancel_work_sync(&data->dm_alert_work); while ((skb = __skb_dequeue(&data->drop_queue))) consume_skb(skb); } module_put(THIS_MODULE); return rc; } static void net_dm_trace_off_set(void) { const struct net_dm_alert_ops *ops; int cpu; ops = net_dm_alert_ops_arr[net_dm_alert_mode]; unregister_trace_napi_poll(ops->napi_poll_probe, NULL); unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL); tracepoint_synchronize_unregister(); /* Make sure we do not send notifications to user space after request * to stop tracing returns. */ for_each_possible_cpu(cpu) { struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); struct sk_buff *skb; del_timer_sync(&data->send_timer); cancel_work_sync(&data->dm_alert_work); while ((skb = __skb_dequeue(&data->drop_queue))) consume_skb(skb); } module_put(THIS_MODULE); } static int set_all_monitor_traces(int state, struct netlink_ext_ack *extack) { int rc = 0; if (state == trace_state) { NL_SET_ERR_MSG_MOD(extack, "Trace state already set to requested state"); return -EAGAIN; } switch (state) { case TRACE_ON: rc = net_dm_trace_on_set(extack); break; case TRACE_OFF: net_dm_trace_off_set(); break; default: rc = 1; break; } if (!rc) trace_state = state; else rc = -EINPROGRESS; return rc; } static bool net_dm_is_monitoring(void) { return trace_state == TRACE_ON || monitor_hw; } static int net_dm_alert_mode_get_from_info(struct genl_info *info, enum net_dm_alert_mode *p_alert_mode) { u8 val; val = nla_get_u8(info->attrs[NET_DM_ATTR_ALERT_MODE]); switch (val) { case NET_DM_ALERT_MODE_SUMMARY: case NET_DM_ALERT_MODE_PACKET: *p_alert_mode = val; break; default: return -EINVAL; } return 0; } static int net_dm_alert_mode_set(struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; enum net_dm_alert_mode alert_mode; int rc; if (!info->attrs[NET_DM_ATTR_ALERT_MODE]) return 0; rc = net_dm_alert_mode_get_from_info(info, &alert_mode); if (rc) { NL_SET_ERR_MSG_MOD(extack, "Invalid alert mode"); return -EINVAL; } net_dm_alert_mode = alert_mode; return 0; } static void net_dm_trunc_len_set(struct genl_info *info) { if (!info->attrs[NET_DM_ATTR_TRUNC_LEN]) return; net_dm_trunc_len = nla_get_u32(info->attrs[NET_DM_ATTR_TRUNC_LEN]); } static void net_dm_queue_len_set(struct genl_info *info) { if (!info->attrs[NET_DM_ATTR_QUEUE_LEN]) return; net_dm_queue_len = nla_get_u32(info->attrs[NET_DM_ATTR_QUEUE_LEN]); } static int net_dm_cmd_config(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; int rc; if (net_dm_is_monitoring()) { NL_SET_ERR_MSG_MOD(extack, "Cannot configure drop monitor during monitoring"); return -EBUSY; } rc = net_dm_alert_mode_set(info); if (rc) return rc; net_dm_trunc_len_set(info); net_dm_queue_len_set(info); return 0; } static int net_dm_monitor_start(bool set_sw, bool set_hw, struct netlink_ext_ack *extack) { bool sw_set = false; int rc; if (set_sw) { rc = 
set_all_monitor_traces(TRACE_ON, extack); if (rc) return rc; sw_set = true; } if (set_hw) { rc = net_dm_hw_monitor_start(extack); if (rc) goto err_monitor_hw; } return 0; err_monitor_hw: if (sw_set) set_all_monitor_traces(TRACE_OFF, extack); return rc; } static void net_dm_monitor_stop(bool set_sw, bool set_hw, struct netlink_ext_ack *extack) { if (set_hw) net_dm_hw_monitor_stop(extack); if (set_sw) set_all_monitor_traces(TRACE_OFF, extack); } static int net_dm_cmd_trace(struct sk_buff *skb, struct genl_info *info) { bool set_sw = !!info->attrs[NET_DM_ATTR_SW_DROPS]; bool set_hw = !!info->attrs[NET_DM_ATTR_HW_DROPS]; struct netlink_ext_ack *extack = info->extack; /* To maintain backward compatibility, we start / stop monitoring of * software drops if no flag is specified. */ if (!set_sw && !set_hw) set_sw = true; switch (info->genlhdr->cmd) { case NET_DM_CMD_START: return net_dm_monitor_start(set_sw, set_hw, extack); case NET_DM_CMD_STOP: net_dm_monitor_stop(set_sw, set_hw, extack); return 0; } return -EOPNOTSUPP; } static int net_dm_config_fill(struct sk_buff *msg, struct genl_info *info) { void *hdr; hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &net_drop_monitor_family, 0, NET_DM_CMD_CONFIG_NEW); if (!hdr) return -EMSGSIZE; if (nla_put_u8(msg, NET_DM_ATTR_ALERT_MODE, net_dm_alert_mode)) goto nla_put_failure; if (nla_put_u32(msg, NET_DM_ATTR_TRUNC_LEN, net_dm_trunc_len)) goto nla_put_failure; if (nla_put_u32(msg, NET_DM_ATTR_QUEUE_LEN, net_dm_queue_len)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int net_dm_cmd_config_get(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; int rc; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; rc = net_dm_config_fill(msg, info); if (rc) goto free_msg; return genlmsg_reply(msg, info); free_msg: nlmsg_free(msg); return rc; } static void net_dm_stats_read(struct net_dm_stats *stats) { int cpu; memset(stats, 0, sizeof(*stats)); for_each_possible_cpu(cpu) { struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); struct net_dm_stats *cpu_stats = &data->stats; unsigned int start; u64 dropped; do { start = u64_stats_fetch_begin(&cpu_stats->syncp); dropped = u64_stats_read(&cpu_stats->dropped); } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); u64_stats_add(&stats->dropped, dropped); } } static int net_dm_stats_put(struct sk_buff *msg) { struct net_dm_stats stats; struct nlattr *attr; net_dm_stats_read(&stats); attr = nla_nest_start(msg, NET_DM_ATTR_STATS); if (!attr) return -EMSGSIZE; if (nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED, u64_stats_read(&stats.dropped), NET_DM_ATTR_PAD)) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static void net_dm_hw_stats_read(struct net_dm_stats *stats) { int cpu; memset(stats, 0, sizeof(*stats)); for_each_possible_cpu(cpu) { struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); struct net_dm_stats *cpu_stats = &hw_data->stats; unsigned int start; u64 dropped; do { start = u64_stats_fetch_begin(&cpu_stats->syncp); dropped = u64_stats_read(&cpu_stats->dropped); } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); u64_stats_add(&stats->dropped, dropped); } } static int net_dm_hw_stats_put(struct sk_buff *msg) { struct net_dm_stats stats; struct nlattr *attr; net_dm_hw_stats_read(&stats); attr = nla_nest_start(msg, NET_DM_ATTR_HW_STATS); if (!attr) return -EMSGSIZE; if 
(nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED, u64_stats_read(&stats.dropped), NET_DM_ATTR_PAD)) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static int net_dm_stats_fill(struct sk_buff *msg, struct genl_info *info) { void *hdr; int rc; hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &net_drop_monitor_family, 0, NET_DM_CMD_STATS_NEW); if (!hdr) return -EMSGSIZE; rc = net_dm_stats_put(msg); if (rc) goto nla_put_failure; rc = net_dm_hw_stats_put(msg); if (rc) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int net_dm_cmd_stats_get(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; int rc; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; rc = net_dm_stats_fill(msg, info); if (rc) goto free_msg; return genlmsg_reply(msg, info); free_msg: nlmsg_free(msg); return rc; } static int dropmon_net_event(struct notifier_block *ev_block, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct dm_hw_stat_delta *stat; switch (event) { case NETDEV_REGISTER: if (WARN_ON_ONCE(rtnl_dereference(dev->dm_private))) break; stat = kzalloc(sizeof(*stat), GFP_KERNEL); if (!stat) break; stat->last_rx = jiffies; rcu_assign_pointer(dev->dm_private, stat); break; case NETDEV_UNREGISTER: stat = rtnl_dereference(dev->dm_private); if (stat) { rcu_assign_pointer(dev->dm_private, NULL); kfree_rcu(stat, rcu); } break; } return NOTIFY_DONE; } static const struct nla_policy net_dm_nl_policy[NET_DM_ATTR_MAX + 1] = { [NET_DM_ATTR_UNSPEC] = { .strict_start_type = NET_DM_ATTR_UNSPEC + 1 }, [NET_DM_ATTR_ALERT_MODE] = { .type = NLA_U8 }, [NET_DM_ATTR_TRUNC_LEN] = { .type = NLA_U32 }, [NET_DM_ATTR_QUEUE_LEN] = { .type = NLA_U32 }, [NET_DM_ATTR_SW_DROPS] = {. type = NLA_FLAG }, [NET_DM_ATTR_HW_DROPS] = {. 
type = NLA_FLAG }, }; static const struct genl_small_ops dropmon_ops[] = { { .cmd = NET_DM_CMD_CONFIG, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = net_dm_cmd_config, .flags = GENL_ADMIN_PERM, }, { .cmd = NET_DM_CMD_START, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = net_dm_cmd_trace, .flags = GENL_ADMIN_PERM, }, { .cmd = NET_DM_CMD_STOP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = net_dm_cmd_trace, .flags = GENL_ADMIN_PERM, }, { .cmd = NET_DM_CMD_CONFIG_GET, .doit = net_dm_cmd_config_get, }, { .cmd = NET_DM_CMD_STATS_GET, .doit = net_dm_cmd_stats_get, }, }; static int net_dm_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { mutex_lock(&net_dm_mutex); return 0; } static void net_dm_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { mutex_unlock(&net_dm_mutex); } static struct genl_family net_drop_monitor_family __ro_after_init = { .hdrsize = 0, .name = "NET_DM", .version = 2, .maxattr = NET_DM_ATTR_MAX, .policy = net_dm_nl_policy, .pre_doit = net_dm_nl_pre_doit, .post_doit = net_dm_nl_post_doit, .module = THIS_MODULE, .small_ops = dropmon_ops, .n_small_ops = ARRAY_SIZE(dropmon_ops), .resv_start_op = NET_DM_CMD_STATS_GET + 1, .mcgrps = dropmon_mcgrps, .n_mcgrps = ARRAY_SIZE(dropmon_mcgrps), }; static struct notifier_block dropmon_net_notifier = { .notifier_call = dropmon_net_event }; static void __net_dm_cpu_data_init(struct per_cpu_dm_data *data) { raw_spin_lock_init(&data->lock); skb_queue_head_init(&data->drop_queue); u64_stats_init(&data->stats.syncp); } static void __net_dm_cpu_data_fini(struct per_cpu_dm_data *data) { WARN_ON(!skb_queue_empty(&data->drop_queue)); } static void net_dm_cpu_data_init(int cpu) { struct per_cpu_dm_data *data; data = &per_cpu(dm_cpu_data, cpu); __net_dm_cpu_data_init(data); } static void net_dm_cpu_data_fini(int cpu) { struct per_cpu_dm_data *data; data = &per_cpu(dm_cpu_data, cpu); /* At this point, we should have exclusive access * to this struct and can free the skb inside it. 
*/ consume_skb(data->skb); __net_dm_cpu_data_fini(data); } static void net_dm_hw_cpu_data_init(int cpu) { struct per_cpu_dm_data *hw_data; hw_data = &per_cpu(dm_hw_cpu_data, cpu); __net_dm_cpu_data_init(hw_data); } static void net_dm_hw_cpu_data_fini(int cpu) { struct per_cpu_dm_data *hw_data; hw_data = &per_cpu(dm_hw_cpu_data, cpu); kfree(hw_data->hw_entries); __net_dm_cpu_data_fini(hw_data); } static int __init init_net_drop_monitor(void) { int cpu, rc; pr_info("Initializing network drop monitor service\n"); if (sizeof(void *) > 8) { pr_err("Unable to store program counters on this arch, Drop monitor failed\n"); return -ENOSPC; } rc = genl_register_family(&net_drop_monitor_family); if (rc) { pr_err("Could not create drop monitor netlink family\n"); return rc; } WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT); rc = register_netdevice_notifier(&dropmon_net_notifier); if (rc < 0) { pr_crit("Failed to register netdevice notifier\n"); goto out_unreg; } rc = 0; for_each_possible_cpu(cpu) { net_dm_cpu_data_init(cpu); net_dm_hw_cpu_data_init(cpu); } goto out; out_unreg: genl_unregister_family(&net_drop_monitor_family); out: return rc; } static void exit_net_drop_monitor(void) { int cpu; BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); /* * Because of the module_get/put we do in the trace state change path * we are guaranteed not to have any current users when we get here */ for_each_possible_cpu(cpu) { net_dm_hw_cpu_data_fini(cpu); net_dm_cpu_data_fini(cpu); } BUG_ON(genl_unregister_family(&net_drop_monitor_family)); } module_init(init_net_drop_monitor); module_exit(exit_net_drop_monitor); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Neil Horman <nhorman@tuxdriver.com>"); MODULE_ALIAS_GENL_FAMILY("NET_DM"); MODULE_DESCRIPTION("Monitoring code for network dropped packet alerts");
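The "NET_DM" generic netlink family registered above is normally driven by the dropwatch tool; the sketch below is an illustrative stand-alone client, assuming libnl-3/libnl-genl-3 is available, that resolves the family by name and issues NET_DM_CMD_START (handled by net_dm_cmd_trace(); with no flag attribute only software drop monitoring is started). The op is marked GENL_ADMIN_PERM, so CAP_NET_ADMIN is required.

/* Illustrative sketch only: start software drop monitoring via NET_DM.
 * Build (assumption): cc dm_start.c $(pkg-config --cflags --libs libnl-genl-3.0) */
#include <stdio.h>
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>
#include <linux/net_dropmon.h>

int main(void)
{
	struct nl_sock *sk = nl_socket_alloc();
	struct nl_msg *msg;
	int family, err;

	if (!sk || genl_connect(sk))
		return 1;

	/* Resolve the "NET_DM" family registered by genl_register_family(). */
	family = genl_ctrl_resolve(sk, "NET_DM");
	if (family < 0)
		return 1;

	msg = nlmsg_alloc();
	/* Family version is 2, see net_drop_monitor_family above. */
	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
		    NET_DM_CMD_START, 2);
	err = nl_send_auto(sk, msg);
	if (err >= 0)
		err = nl_wait_for_ack(sk);
	printf("NET_DM_CMD_START: %s\n", err < 0 ? nl_geterror(err) : "ok");

	nlmsg_free(msg);
	nl_socket_free(sk);
	return err < 0;
}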
// SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/tcp.h> #include <linux/rcupdate.h> #include <net/tcp.h> void tcp_fastopen_init_key_once(struct net *net) { u8 key[TCP_FASTOPEN_KEY_LENGTH]; struct tcp_fastopen_context *ctxt; rcu_read_lock(); ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); if (ctxt) { rcu_read_unlock(); return; } rcu_read_unlock(); /* tcp_fastopen_reset_cipher publishes the new context * atomically, so we allow this race happening here. * * All call sites of tcp_fastopen_cookie_gen also check * for a valid cookie, so this is an acceptable risk. 
*/ get_random_bytes(key, sizeof(key)); tcp_fastopen_reset_cipher(net, NULL, key, NULL); } static void tcp_fastopen_ctx_free(struct rcu_head *head) { struct tcp_fastopen_context *ctx = container_of(head, struct tcp_fastopen_context, rcu); kfree_sensitive(ctx); } void tcp_fastopen_destroy_cipher(struct sock *sk) { struct tcp_fastopen_context *ctx; ctx = rcu_dereference_protected( inet_csk(sk)->icsk_accept_queue.fastopenq.ctx, 1); if (ctx) call_rcu(&ctx->rcu, tcp_fastopen_ctx_free); } void tcp_fastopen_ctx_destroy(struct net *net) { struct tcp_fastopen_context *ctxt; ctxt = unrcu_pointer(xchg(&net->ipv4.tcp_fastopen_ctx, NULL)); if (ctxt) call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free); } int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, void *primary_key, void *backup_key) { struct tcp_fastopen_context *ctx, *octx; struct fastopen_queue *q; int err = 0; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) { err = -ENOMEM; goto out; } ctx->key[0].key[0] = get_unaligned_le64(primary_key); ctx->key[0].key[1] = get_unaligned_le64(primary_key + 8); if (backup_key) { ctx->key[1].key[0] = get_unaligned_le64(backup_key); ctx->key[1].key[1] = get_unaligned_le64(backup_key + 8); ctx->num = 2; } else { ctx->num = 1; } if (sk) { q = &inet_csk(sk)->icsk_accept_queue.fastopenq; octx = unrcu_pointer(xchg(&q->ctx, RCU_INITIALIZER(ctx))); } else { octx = unrcu_pointer(xchg(&net->ipv4.tcp_fastopen_ctx, RCU_INITIALIZER(ctx))); } if (octx) call_rcu(&octx->rcu, tcp_fastopen_ctx_free); out: return err; } int tcp_fastopen_get_cipher(struct net *net, struct inet_connection_sock *icsk, u64 *key) { struct tcp_fastopen_context *ctx; int n_keys = 0, i; rcu_read_lock(); if (icsk) ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx); else ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx); if (ctx) { n_keys = tcp_fastopen_context_len(ctx); for (i = 0; i < n_keys; i++) { put_unaligned_le64(ctx->key[i].key[0], key + (i * 2)); put_unaligned_le64(ctx->key[i].key[1], key + (i * 2) + 1); } } rcu_read_unlock(); return n_keys; } static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req, struct sk_buff *syn, const siphash_key_t *key, struct tcp_fastopen_cookie *foc) { BUILD_BUG_ON(TCP_FASTOPEN_COOKIE_SIZE != sizeof(u64)); if (req->rsk_ops->family == AF_INET) { const struct iphdr *iph = ip_hdr(syn); foc->val[0] = cpu_to_le64(siphash(&iph->saddr, sizeof(iph->saddr) + sizeof(iph->daddr), key)); foc->len = TCP_FASTOPEN_COOKIE_SIZE; return true; } #if IS_ENABLED(CONFIG_IPV6) if (req->rsk_ops->family == AF_INET6) { const struct ipv6hdr *ip6h = ipv6_hdr(syn); foc->val[0] = cpu_to_le64(siphash(&ip6h->saddr, sizeof(ip6h->saddr) + sizeof(ip6h->daddr), key)); foc->len = TCP_FASTOPEN_COOKIE_SIZE; return true; } #endif return false; } /* Generate the fastopen cookie by applying SipHash to both the source and * destination addresses. */ static void tcp_fastopen_cookie_gen(struct sock *sk, struct request_sock *req, struct sk_buff *syn, struct tcp_fastopen_cookie *foc) { struct tcp_fastopen_context *ctx; rcu_read_lock(); ctx = tcp_fastopen_get_ctx(sk); if (ctx) __tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[0], foc); rcu_read_unlock(); } /* If an incoming SYN or SYNACK frame contains a payload and/or FIN, * queue this additional data / FIN. 
*/ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); if (TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt) return; skb = skb_clone(skb, GFP_ATOMIC); if (!skb) return; skb_dst_drop(skb); /* segs_in has been initialized to 1 in tcp_create_openreq_child(). * Hence, reset segs_in to 0 before calling tcp_segs_in() * to avoid double counting. Also, tcp_segs_in() expects * skb->len to include the tcp_hdrlen. Hence, it should * be called before __skb_pull(). */ tp->segs_in = 0; tcp_segs_in(tp, skb); __skb_pull(skb, tcp_hdrlen(skb)); sk_forced_mem_schedule(sk, skb->truesize); skb_set_owner_r(skb, sk); TCP_SKB_CB(skb)->seq++; TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN; tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; __skb_queue_tail(&sk->sk_receive_queue, skb); tp->syn_data_acked = 1; /* u64_stats_update_begin(&tp->syncp) not needed here, * as we certainly are not changing upper 32bit value (0) */ tp->bytes_received = skb->len; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) tcp_fin(sk); } /* returns 0 - no key match, 1 for primary, 2 for backup */ static int tcp_fastopen_cookie_gen_check(struct sock *sk, struct request_sock *req, struct sk_buff *syn, struct tcp_fastopen_cookie *orig, struct tcp_fastopen_cookie *valid_foc) { struct tcp_fastopen_cookie search_foc = { .len = -1 }; struct tcp_fastopen_cookie *foc = valid_foc; struct tcp_fastopen_context *ctx; int i, ret = 0; rcu_read_lock(); ctx = tcp_fastopen_get_ctx(sk); if (!ctx) goto out; for (i = 0; i < tcp_fastopen_context_len(ctx); i++) { __tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[i], foc); if (tcp_fastopen_cookie_match(foc, orig)) { ret = i + 1; goto out; } foc = &search_foc; } out: rcu_read_unlock(); return ret; } static struct sock *tcp_fastopen_create_child(struct sock *sk, struct sk_buff *skb, struct request_sock *req) { struct tcp_sock *tp; struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; struct sock *child; bool own_req; child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, NULL, &own_req); if (!child) return NULL; spin_lock(&queue->fastopenq.lock); queue->fastopenq.qlen++; spin_unlock(&queue->fastopenq.lock); /* Initialize the child socket. Have to fix some values to take * into account the child is a Fast Open socket and is created * only out of the bits carried in the SYN packet. */ tp = tcp_sk(child); rcu_assign_pointer(tp->fastopen_rsk, req); tcp_rsk(req)->tfo_listener = true; /* RFC1323: The window in SYN & SYN/ACK segments is never * scaled. So correct it appropriately. */ tp->snd_wnd = ntohs(tcp_hdr(skb)->window); tp->max_window = tp->snd_wnd; /* Activate the retrans timer so that SYNACK can be retransmitted. * The request socket is not added to the ehash * because it's been added to the accept queue directly. */ req->timeout = tcp_timeout_init(child); inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, req->timeout, TCP_RTO_MAX); refcount_set(&req->rsk_refcnt, 2); /* Now finish processing the fastopen child socket. */ tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; tcp_fastopen_add_skb(child, skb); tcp_rsk(req)->rcv_nxt = tp->rcv_nxt; tp->rcv_wup = tp->rcv_nxt; /* tcp_conn_request() is sending the SYNACK, * and queues the child into listener accept queue. 
*/ return child; } static bool tcp_fastopen_queue_check(struct sock *sk) { struct fastopen_queue *fastopenq; int max_qlen; /* Make sure the listener has enabled fastopen, and we don't * exceed the max # of pending TFO requests allowed before trying * to validating the cookie in order to avoid burning CPU cycles * unnecessarily. * * XXX (TFO) - The implication of checking the max_qlen before * processing a cookie request is that clients can't differentiate * between qlen overflow causing Fast Open to be disabled * temporarily vs a server not supporting Fast Open at all. */ fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq; max_qlen = READ_ONCE(fastopenq->max_qlen); if (max_qlen == 0) return false; if (fastopenq->qlen >= max_qlen) { struct request_sock *req1; spin_lock(&fastopenq->lock); req1 = fastopenq->rskq_rst_head; if (!req1 || time_after(req1->rsk_timer.expires, jiffies)) { __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENLISTENOVERFLOW); spin_unlock(&fastopenq->lock); return false; } fastopenq->rskq_rst_head = req1->dl_next; fastopenq->qlen--; spin_unlock(&fastopenq->lock); reqsk_put(req1); } return true; } static bool tcp_fastopen_no_cookie(const struct sock *sk, const struct dst_entry *dst, int flag) { return (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) & flag) || tcp_sk(sk)->fastopen_no_cookie || (dst && dst_metric(dst, RTAX_FASTOPEN_NO_COOKIE)); } /* Returns true if we should perform Fast Open on the SYN. The cookie (foc) * may be updated and return the client in the SYN-ACK later. E.g., Fast Open * cookie request (foc->len == 0). */ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc, const struct dst_entry *dst) { bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; int tcp_fastopen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen); struct tcp_fastopen_cookie valid_foc = { .len = -1 }; struct sock *child; int ret = 0; if (foc->len == 0) /* Client requests a cookie */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); if (!((tcp_fastopen & TFO_SERVER_ENABLE) && (syn_data || foc->len >= 0) && tcp_fastopen_queue_check(sk))) { foc->len = -1; return NULL; } if (tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD)) goto fastopen; if (foc->len == 0) { /* Client requests a cookie. */ tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc); } else if (foc->len > 0) { ret = tcp_fastopen_cookie_gen_check(sk, req, skb, foc, &valid_foc); if (!ret) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); } else { /* Cookie is valid. Create a (full) child socket to * accept the data in SYN before returning a SYN-ACK to * ack the data. If we fail to create the socket, fall * back and ack the ISN only but includes the same * cookie. * * Note: Data-less SYN with valid cookie is allowed to * send data in SYN_RECV state. 
*/ fastopen: child = tcp_fastopen_create_child(sk, skb, req); if (child) { if (ret == 2) { valid_foc.exp = foc->exp; *foc = valid_foc; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEALTKEY); } else { foc->len = -1; } NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE); return child; } NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); } } valid_foc.exp = foc->exp; *foc = valid_foc; return NULL; } bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie) { const struct dst_entry *dst; tcp_fastopen_cache_get(sk, mss, cookie); /* Firewall blackhole issue check */ if (tcp_fastopen_active_should_disable(sk)) { cookie->len = -1; return false; } dst = __sk_dst_get(sk); if (tcp_fastopen_no_cookie(sk, dst, TFO_CLIENT_NO_COOKIE)) { cookie->len = -1; return true; } if (cookie->len > 0) return true; tcp_sk(sk)->fastopen_client_fail = TFO_COOKIE_UNAVAILABLE; return false; } /* This function checks if we want to defer sending SYN until the first * write(). We defer under the following conditions: * 1. fastopen_connect sockopt is set * 2. we have a valid cookie * Return value: return true if we want to defer until application writes data * return false if we want to send out SYN immediately */ bool tcp_fastopen_defer_connect(struct sock *sk, int *err) { struct tcp_fastopen_cookie cookie = { .len = 0 }; struct tcp_sock *tp = tcp_sk(sk); u16 mss; if (tp->fastopen_connect && !tp->fastopen_req) { if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) { inet_set_bit(DEFER_CONNECT, sk); return true; } /* Alloc fastopen_req in order for FO option to be included * in SYN */ tp->fastopen_req = kzalloc(sizeof(*tp->fastopen_req), sk->sk_allocation); if (tp->fastopen_req) tp->fastopen_req->cookie = cookie; else *err = -ENOBUFS; } return false; } EXPORT_SYMBOL(tcp_fastopen_defer_connect); /* * The following code block is to deal with middle box issues with TFO: * Middlebox firewall issues can potentially cause server's data being * blackholed after a successful 3WHS using TFO. * The proposed solution is to disable active TFO globally under the * following circumstances: * 1. client side TFO socket receives out of order FIN * 2. client side TFO socket receives out of order RST * 3. client side TFO socket has timed out three times consecutively during * or after handshake * We disable active side TFO globally for 1hr at first. Then if it * happens again, we disable it for 2h, then 4h, 8h, ... * And we reset the timeout back to 1hr when we see a successful active * TFO connection with data exchanges. */ /* Disable active TFO and record current jiffies and * tfo_active_disable_times */ void tcp_fastopen_active_disable(struct sock *sk) { struct net *net = sock_net(sk); if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout)) return; /* Paired with READ_ONCE() in tcp_fastopen_active_should_disable() */ WRITE_ONCE(net->ipv4.tfo_active_disable_stamp, jiffies); /* Paired with smp_rmb() in tcp_fastopen_active_should_disable(). * We want net->ipv4.tfo_active_disable_stamp to be updated first. 
*/ smp_mb__before_atomic(); atomic_inc(&net->ipv4.tfo_active_disable_times); NET_INC_STATS(net, LINUX_MIB_TCPFASTOPENBLACKHOLE); } /* Calculate timeout for tfo active disable * Return true if we are still in the active TFO disable period * Return false if timeout already expired and we should use active TFO */ bool tcp_fastopen_active_should_disable(struct sock *sk) { unsigned int tfo_bh_timeout = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout); unsigned long timeout; int tfo_da_times; int multiplier; if (!tfo_bh_timeout) return false; tfo_da_times = atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times); if (!tfo_da_times) return false; /* Paired with smp_mb__before_atomic() in tcp_fastopen_active_disable() */ smp_rmb(); /* Limit timeout to max: 2^6 * initial timeout */ multiplier = 1 << min(tfo_da_times - 1, 6); /* Paired with the WRITE_ONCE() in tcp_fastopen_active_disable(). */ timeout = READ_ONCE(sock_net(sk)->ipv4.tfo_active_disable_stamp) + multiplier * tfo_bh_timeout * HZ; if (time_before(jiffies, timeout)) return true; /* Mark check bit so we can check for successful active TFO * condition and reset tfo_active_disable_times */ tcp_sk(sk)->syn_fastopen_ch = 1; return false; } /* Disable active TFO if FIN is the only packet in the ofo queue * and no data is received. * Also check if we can reset tfo_active_disable_times if data is * received successfully on a marked active TFO sockets opened on * a non-loopback interface */ void tcp_fastopen_active_disable_ofo_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst; struct sk_buff *skb; if (!tp->syn_fastopen) return; if (!tp->data_segs_in) { skb = skb_rb_first(&tp->out_of_order_queue); if (skb && !skb_rb_next(skb)) { if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { tcp_fastopen_active_disable(sk); return; } } } else if (tp->syn_fastopen_ch && atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) { dst = sk_dst_get(sk); if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK))) atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0); dst_release(dst); } } void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired) { u32 timeouts = inet_csk(sk)->icsk_retransmits; struct tcp_sock *tp = tcp_sk(sk); /* Broken middle-boxes may black-hole Fast Open connection during or * even after the handshake. Be extremely conservative and pause * Fast Open globally after hitting the third consecutive timeout or * exceeding the configured timeout limit. */ if ((tp->syn_fastopen || tp->syn_data || tp->syn_data_acked) && (timeouts == 2 || (timeouts < 2 && expired))) { tcp_fastopen_active_disable(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); } }
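Illustrative userspace sketch (not part of the kernel source above): the server-side tcp_try_fastopen() path is only reached once a listener opts in via the TCP_FASTOPEN socket option, whose value is the pending-TFO queue length later checked by tcp_fastopen_queue_check(); a client can then carry data in the SYN, for example with sendto(..., MSG_FASTOPEN, ...). The port number and queue length below are arbitrary example values.

/* tfo_server_listen.c - minimal sketch, assuming a Linux system with
 * CONFIG_TCP_FASTOPEN and net.ipv4.tcp_fastopen allowing server-side TFO. */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int tfo_listen(unsigned short port)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int qlen = 16;		/* max pending TFO requests on this listener */
	struct sockaddr_in addr;

	if (fd < 0)
		return -1;

	/* Enable Fast Open on the listener before listen(). */
	setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen));

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(port);
	addr.sin_addr.s_addr = htonl(INADDR_ANY);

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    listen(fd, 128) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}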
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *
 * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
 * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
 * Copyright (C) Terry Dawson VK2KTJ (terry@animats.net)
 * Copyright (C) Tomi Manninen OH2BNS (oh2bns@sral.fi)
 */

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/spinlock.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/stat.h>
#include <net/net_namespace.h>
#include <net/ax25.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/notifier.h>
#include <net/rose.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/tcp_states.h>
#include <net/ip.h>
#include <net/arp.h>

static int rose_ndevs = 10;

int sysctl_rose_restart_request_timeout = ROSE_DEFAULT_T0;
int sysctl_rose_call_request_timeout = ROSE_DEFAULT_T1;
int sysctl_rose_reset_request_timeout = ROSE_DEFAULT_T2;
int sysctl_rose_clear_request_timeout = ROSE_DEFAULT_T3;
int sysctl_rose_no_activity_timeout = ROSE_DEFAULT_IDLE;
int sysctl_rose_ack_hold_back_timeout = ROSE_DEFAULT_HB;
int sysctl_rose_routing_control = ROSE_DEFAULT_ROUTING;
int sysctl_rose_link_fail_timeout = ROSE_DEFAULT_FAIL_TIMEOUT;
int sysctl_rose_maximum_vcs = ROSE_DEFAULT_MAXVC;
int sysctl_rose_window_size = ROSE_DEFAULT_WINDOW_SIZE;

static HLIST_HEAD(rose_list);
static DEFINE_SPINLOCK(rose_list_lock);

static const struct proto_ops rose_proto_ops;

ax25_address rose_callsign;

/*
 * ROSE network devices are virtual network devices encapsulating ROSE
 * frames into AX.25 which will be sent through an AX.25 device, so form a
 * special "super class" of normal net devices; split their locks off into a
 * separate class since they always nest.
 */
static struct lock_class_key rose_netdev_xmit_lock_key;
static struct lock_class_key rose_netdev_addr_lock_key;

static void rose_set_lockdep_one(struct net_device *dev,
				 struct netdev_queue *txq,
				 void *_unused)
{
	lockdep_set_class(&txq->_xmit_lock, &rose_netdev_xmit_lock_key);
}

static void rose_set_lockdep_key(struct net_device *dev)
{
	lockdep_set_class(&dev->addr_list_lock, &rose_netdev_addr_lock_key);
	netdev_for_each_tx_queue(dev, rose_set_lockdep_one, NULL);
}

/*
 * Convert a ROSE address into text.
 */
char *rose2asc(char *buf, const rose_address *addr)
{
	if (addr->rose_addr[0] == 0x00 && addr->rose_addr[1] == 0x00 &&
	    addr->rose_addr[2] == 0x00 && addr->rose_addr[3] == 0x00 &&
	    addr->rose_addr[4] == 0x00) {
		strcpy(buf, "*");
	} else {
		sprintf(buf, "%02X%02X%02X%02X%02X", addr->rose_addr[0] & 0xFF,
			addr->rose_addr[1] & 0xFF,
			addr->rose_addr[2] & 0xFF,
			addr->rose_addr[3] & 0xFF,
			addr->rose_addr[4] & 0xFF);
	}

	return buf;
}

/*
 * Compare two ROSE addresses, 0 == equal.
*/ int rosecmp(const rose_address *addr1, const rose_address *addr2) { int i; for (i = 0; i < 5; i++) if (addr1->rose_addr[i] != addr2->rose_addr[i]) return 1; return 0; } /* * Compare two ROSE addresses for only mask digits, 0 == equal. */ int rosecmpm(const rose_address *addr1, const rose_address *addr2, unsigned short mask) { unsigned int i, j; if (mask > 10) return 1; for (i = 0; i < mask; i++) { j = i / 2; if ((i % 2) != 0) { if ((addr1->rose_addr[j] & 0x0F) != (addr2->rose_addr[j] & 0x0F)) return 1; } else { if ((addr1->rose_addr[j] & 0xF0) != (addr2->rose_addr[j] & 0xF0)) return 1; } } return 0; } /* * Socket removal during an interrupt is now safe. */ static void rose_remove_socket(struct sock *sk) { spin_lock_bh(&rose_list_lock); sk_del_node_init(sk); spin_unlock_bh(&rose_list_lock); } /* * Kill all bound sockets on a broken link layer connection to a * particular neighbour. */ void rose_kill_by_neigh(struct rose_neigh *neigh) { struct sock *s; spin_lock_bh(&rose_list_lock); sk_for_each(s, &rose_list) { struct rose_sock *rose = rose_sk(s); if (rose->neighbour == neigh) { rose_disconnect(s, ENETUNREACH, ROSE_OUT_OF_ORDER, 0); rose->neighbour->use--; rose->neighbour = NULL; } } spin_unlock_bh(&rose_list_lock); } /* * Kill all bound sockets on a dropped device. */ static void rose_kill_by_device(struct net_device *dev) { struct sock *sk, *array[16]; struct rose_sock *rose; bool rescan; int i, cnt; start: rescan = false; cnt = 0; spin_lock_bh(&rose_list_lock); sk_for_each(sk, &rose_list) { rose = rose_sk(sk); if (rose->device == dev) { if (cnt == ARRAY_SIZE(array)) { rescan = true; break; } sock_hold(sk); array[cnt++] = sk; } } spin_unlock_bh(&rose_list_lock); for (i = 0; i < cnt; i++) { sk = array[cnt]; rose = rose_sk(sk); lock_sock(sk); spin_lock_bh(&rose_list_lock); if (rose->device == dev) { rose_disconnect(sk, ENETUNREACH, ROSE_OUT_OF_ORDER, 0); if (rose->neighbour) rose->neighbour->use--; netdev_put(rose->device, &rose->dev_tracker); rose->device = NULL; } spin_unlock_bh(&rose_list_lock); release_sock(sk); sock_put(sk); cond_resched(); } if (rescan) goto start; } /* * Handle device status changes. */ static int rose_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; if (event != NETDEV_DOWN) return NOTIFY_DONE; switch (dev->type) { case ARPHRD_ROSE: rose_kill_by_device(dev); break; case ARPHRD_AX25: rose_link_device_down(dev); rose_rt_device_down(dev); break; } return NOTIFY_DONE; } /* * Add a socket to the bound sockets list. */ static void rose_insert_socket(struct sock *sk) { spin_lock_bh(&rose_list_lock); sk_add_node(sk, &rose_list); spin_unlock_bh(&rose_list_lock); } /* * Find a socket that wants to accept the Call Request we just * received. */ static struct sock *rose_find_listener(rose_address *addr, ax25_address *call) { struct sock *s; spin_lock_bh(&rose_list_lock); sk_for_each(s, &rose_list) { struct rose_sock *rose = rose_sk(s); if (!rosecmp(&rose->source_addr, addr) && !ax25cmp(&rose->source_call, call) && !rose->source_ndigis && s->sk_state == TCP_LISTEN) goto found; } sk_for_each(s, &rose_list) { struct rose_sock *rose = rose_sk(s); if (!rosecmp(&rose->source_addr, addr) && !ax25cmp(&rose->source_call, &null_ax25_address) && s->sk_state == TCP_LISTEN) goto found; } s = NULL; found: spin_unlock_bh(&rose_list_lock); return s; } /* * Find a connected ROSE socket given my LCI and device. 
*/ struct sock *rose_find_socket(unsigned int lci, struct rose_neigh *neigh) { struct sock *s; spin_lock_bh(&rose_list_lock); sk_for_each(s, &rose_list) { struct rose_sock *rose = rose_sk(s); if (rose->lci == lci && rose->neighbour == neigh) goto found; } s = NULL; found: spin_unlock_bh(&rose_list_lock); return s; } /* * Find a unique LCI for a given device. */ unsigned int rose_new_lci(struct rose_neigh *neigh) { int lci; if (neigh->dce_mode) { for (lci = 1; lci <= sysctl_rose_maximum_vcs; lci++) if (rose_find_socket(lci, neigh) == NULL && rose_route_free_lci(lci, neigh) == NULL) return lci; } else { for (lci = sysctl_rose_maximum_vcs; lci > 0; lci--) if (rose_find_socket(lci, neigh) == NULL && rose_route_free_lci(lci, neigh) == NULL) return lci; } return 0; } /* * Deferred destroy. */ void rose_destroy_socket(struct sock *); /* * Handler for deferred kills. */ static void rose_destroy_timer(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); rose_destroy_socket(sk); } /* * This is called from user mode and the timers. Thus it protects itself * against interrupt users but doesn't worry about being called during * work. Once it is removed from the queue no interrupt or bottom half * will touch it and we are (fairly 8-) ) safe. */ void rose_destroy_socket(struct sock *sk) { struct sk_buff *skb; rose_remove_socket(sk); rose_stop_heartbeat(sk); rose_stop_idletimer(sk); rose_stop_timer(sk); rose_clear_queues(sk); /* Flush the queues */ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { if (skb->sk != sk) { /* A pending connection */ /* Queue the unaccepted socket for death */ sock_set_flag(skb->sk, SOCK_DEAD); rose_start_heartbeat(skb->sk); rose_sk(skb->sk)->state = ROSE_STATE_0; } kfree_skb(skb); } if (sk_has_allocations(sk)) { /* Defer: outstanding buffers */ timer_setup(&sk->sk_timer, rose_destroy_timer, 0); sk->sk_timer.expires = jiffies + 10 * HZ; add_timer(&sk->sk_timer); } else sock_put(sk); } /* * Handling for system calls applied via the various interfaces to a * ROSE socket object. */ static int rose_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); int opt; if (level != SOL_ROSE) return -ENOPROTOOPT; if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&opt, optval, sizeof(int))) return -EFAULT; switch (optname) { case ROSE_DEFER: rose->defer = opt ? 1 : 0; return 0; case ROSE_T1: if (opt < 1) return -EINVAL; rose->t1 = opt * HZ; return 0; case ROSE_T2: if (opt < 1) return -EINVAL; rose->t2 = opt * HZ; return 0; case ROSE_T3: if (opt < 1) return -EINVAL; rose->t3 = opt * HZ; return 0; case ROSE_HOLDBACK: if (opt < 1) return -EINVAL; rose->hb = opt * HZ; return 0; case ROSE_IDLE: if (opt < 0) return -EINVAL; rose->idle = opt * 60 * HZ; return 0; case ROSE_QBITINCL: rose->qbitincl = opt ? 
1 : 0; return 0; default: return -ENOPROTOOPT; } } static int rose_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); int val = 0; int len; if (level != SOL_ROSE) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case ROSE_DEFER: val = rose->defer; break; case ROSE_T1: val = rose->t1 / HZ; break; case ROSE_T2: val = rose->t2 / HZ; break; case ROSE_T3: val = rose->t3 / HZ; break; case ROSE_HOLDBACK: val = rose->hb / HZ; break; case ROSE_IDLE: val = rose->idle / (60 * HZ); break; case ROSE_QBITINCL: val = rose->qbitincl; break; default: return -ENOPROTOOPT; } len = min_t(unsigned int, len, sizeof(int)); if (put_user(len, optlen)) return -EFAULT; return copy_to_user(optval, &val, len) ? -EFAULT : 0; } static int rose_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; lock_sock(sk); if (sock->state != SS_UNCONNECTED) { release_sock(sk); return -EINVAL; } if (sk->sk_state != TCP_LISTEN) { struct rose_sock *rose = rose_sk(sk); rose->dest_ndigis = 0; memset(&rose->dest_addr, 0, ROSE_ADDR_LEN); memset(&rose->dest_call, 0, AX25_ADDR_LEN); memset(rose->dest_digis, 0, AX25_ADDR_LEN * ROSE_MAX_DIGIS); sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; release_sock(sk); return 0; } release_sock(sk); return -EOPNOTSUPP; } static struct proto rose_proto = { .name = "ROSE", .owner = THIS_MODULE, .obj_size = sizeof(struct rose_sock), }; static int rose_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct rose_sock *rose; if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; if (sock->type != SOCK_SEQPACKET || protocol != 0) return -ESOCKTNOSUPPORT; sk = sk_alloc(net, PF_ROSE, GFP_ATOMIC, &rose_proto, kern); if (sk == NULL) return -ENOMEM; rose = rose_sk(sk); sock_init_data(sock, sk); skb_queue_head_init(&rose->ack_queue); #ifdef M_BIT skb_queue_head_init(&rose->frag_queue); rose->fraglen = 0; #endif sock->ops = &rose_proto_ops; sk->sk_protocol = protocol; timer_setup(&rose->timer, NULL, 0); timer_setup(&rose->idletimer, NULL, 0); rose->t1 = msecs_to_jiffies(sysctl_rose_call_request_timeout); rose->t2 = msecs_to_jiffies(sysctl_rose_reset_request_timeout); rose->t3 = msecs_to_jiffies(sysctl_rose_clear_request_timeout); rose->hb = msecs_to_jiffies(sysctl_rose_ack_hold_back_timeout); rose->idle = msecs_to_jiffies(sysctl_rose_no_activity_timeout); rose->state = ROSE_STATE_0; return 0; } static struct sock *rose_make_new(struct sock *osk) { struct sock *sk; struct rose_sock *rose, *orose; if (osk->sk_type != SOCK_SEQPACKET) return NULL; sk = sk_alloc(sock_net(osk), PF_ROSE, GFP_ATOMIC, &rose_proto, 0); if (sk == NULL) return NULL; rose = rose_sk(sk); sock_init_data(NULL, sk); skb_queue_head_init(&rose->ack_queue); #ifdef M_BIT skb_queue_head_init(&rose->frag_queue); rose->fraglen = 0; #endif sk->sk_type = osk->sk_type; sk->sk_priority = READ_ONCE(osk->sk_priority); sk->sk_protocol = osk->sk_protocol; sk->sk_rcvbuf = osk->sk_rcvbuf; sk->sk_sndbuf = osk->sk_sndbuf; sk->sk_state = TCP_ESTABLISHED; sock_copy_flags(sk, osk); timer_setup(&rose->timer, NULL, 0); timer_setup(&rose->idletimer, NULL, 0); orose = rose_sk(osk); rose->t1 = orose->t1; rose->t2 = orose->t2; rose->t3 = orose->t3; rose->hb = orose->hb; rose->idle = orose->idle; rose->defer = orose->defer; rose->device = orose->device; if (rose->device) netdev_hold(rose->device, &rose->dev_tracker, GFP_ATOMIC); 
rose->qbitincl = orose->qbitincl; return sk; } static int rose_release(struct socket *sock) { struct sock *sk = sock->sk; struct rose_sock *rose; if (sk == NULL) return 0; sock_hold(sk); sock_orphan(sk); lock_sock(sk); rose = rose_sk(sk); switch (rose->state) { case ROSE_STATE_0: release_sock(sk); rose_disconnect(sk, 0, -1, -1); lock_sock(sk); rose_destroy_socket(sk); break; case ROSE_STATE_2: rose->neighbour->use--; release_sock(sk); rose_disconnect(sk, 0, -1, -1); lock_sock(sk); rose_destroy_socket(sk); break; case ROSE_STATE_1: case ROSE_STATE_3: case ROSE_STATE_4: case ROSE_STATE_5: rose_clear_queues(sk); rose_stop_idletimer(sk); rose_write_internal(sk, ROSE_CLEAR_REQUEST); rose_start_t3timer(sk); rose->state = ROSE_STATE_2; sk->sk_state = TCP_CLOSE; sk->sk_shutdown |= SEND_SHUTDOWN; sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DEAD); sock_set_flag(sk, SOCK_DESTROY); break; default: break; } spin_lock_bh(&rose_list_lock); netdev_put(rose->device, &rose->dev_tracker); rose->device = NULL; spin_unlock_bh(&rose_list_lock); sock->sk = NULL; release_sock(sk); sock_put(sk); return 0; } static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr; struct net_device *dev; ax25_address *source; ax25_uid_assoc *user; int n; if (!sock_flag(sk, SOCK_ZAPPED)) return -EINVAL; if (addr_len != sizeof(struct sockaddr_rose) && addr_len != sizeof(struct full_sockaddr_rose)) return -EINVAL; if (addr->srose_family != AF_ROSE) return -EINVAL; if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1) return -EINVAL; if ((unsigned int) addr->srose_ndigis > ROSE_MAX_DIGIS) return -EINVAL; if ((dev = rose_dev_get(&addr->srose_addr)) == NULL) return -EADDRNOTAVAIL; source = &addr->srose_call; user = ax25_findbyuid(current_euid()); if (user) { rose->source_call = user->call; ax25_uid_put(user); } else { if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) { dev_put(dev); return -EACCES; } rose->source_call = *source; } rose->source_addr = addr->srose_addr; rose->device = dev; netdev_tracker_alloc(rose->device, &rose->dev_tracker, GFP_KERNEL); rose->source_ndigis = addr->srose_ndigis; if (addr_len == sizeof(struct full_sockaddr_rose)) { struct full_sockaddr_rose *full_addr = (struct full_sockaddr_rose *)uaddr; for (n = 0 ; n < addr->srose_ndigis ; n++) rose->source_digis[n] = full_addr->srose_digis[n]; } else { if (rose->source_ndigis == 1) { rose->source_digis[0] = addr->srose_digi; } } rose_insert_socket(sk); sock_reset_flag(sk, SOCK_ZAPPED); return 0; } static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr; unsigned char cause, diagnostic; ax25_uid_assoc *user; int n, err = 0; if (addr_len != sizeof(struct sockaddr_rose) && addr_len != sizeof(struct full_sockaddr_rose)) return -EINVAL; if (addr->srose_family != AF_ROSE) return -EINVAL; if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1) return -EINVAL; if ((unsigned int) addr->srose_ndigis > ROSE_MAX_DIGIS) return -EINVAL; /* Source + Destination digis should not exceed ROSE_MAX_DIGIS */ if ((rose->source_ndigis + addr->srose_ndigis) > ROSE_MAX_DIGIS) return -EINVAL; lock_sock(sk); if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { /* Connect completed during a ERESTARTSYS event */ sock->state = 
SS_CONNECTED; goto out_release; } if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) { sock->state = SS_UNCONNECTED; err = -ECONNREFUSED; goto out_release; } if (sk->sk_state == TCP_ESTABLISHED) { /* No reconnect on a seqpacket socket */ err = -EISCONN; goto out_release; } sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; rose->neighbour = rose_get_neigh(&addr->srose_addr, &cause, &diagnostic, 0); if (!rose->neighbour) { err = -ENETUNREACH; goto out_release; } rose->lci = rose_new_lci(rose->neighbour); if (!rose->lci) { err = -ENETUNREACH; goto out_release; } if (sock_flag(sk, SOCK_ZAPPED)) { /* Must bind first - autobinding in this may or may not work */ struct net_device *dev; sock_reset_flag(sk, SOCK_ZAPPED); dev = rose_dev_first(); if (!dev) { err = -ENETUNREACH; goto out_release; } user = ax25_findbyuid(current_euid()); if (!user) { err = -EINVAL; dev_put(dev); goto out_release; } memcpy(&rose->source_addr, dev->dev_addr, ROSE_ADDR_LEN); rose->source_call = user->call; rose->device = dev; netdev_tracker_alloc(rose->device, &rose->dev_tracker, GFP_KERNEL); ax25_uid_put(user); rose_insert_socket(sk); /* Finish the bind */ } rose->dest_addr = addr->srose_addr; rose->dest_call = addr->srose_call; rose->rand = ((long)rose & 0xFFFF) + rose->lci; rose->dest_ndigis = addr->srose_ndigis; if (addr_len == sizeof(struct full_sockaddr_rose)) { struct full_sockaddr_rose *full_addr = (struct full_sockaddr_rose *)uaddr; for (n = 0 ; n < addr->srose_ndigis ; n++) rose->dest_digis[n] = full_addr->srose_digis[n]; } else { if (rose->dest_ndigis == 1) { rose->dest_digis[0] = addr->srose_digi; } } /* Move to connecting socket, start sending Connect Requests */ sock->state = SS_CONNECTING; sk->sk_state = TCP_SYN_SENT; rose->state = ROSE_STATE_1; rose->neighbour->use++; rose_write_internal(sk, ROSE_CALL_REQUEST); rose_start_heartbeat(sk); rose_start_t1timer(sk); /* Now the loop */ if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) { err = -EINPROGRESS; goto out_release; } /* * A Connect Ack with Choke or timeout or failed routing will go to * closed. 
*/ if (sk->sk_state == TCP_SYN_SENT) { DEFINE_WAIT(wait); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (sk->sk_state != TCP_SYN_SENT) break; if (!signal_pending(current)) { release_sock(sk); schedule(); lock_sock(sk); continue; } err = -ERESTARTSYS; break; } finish_wait(sk_sleep(sk), &wait); if (err) goto out_release; } if (sk->sk_state != TCP_ESTABLISHED) { sock->state = SS_UNCONNECTED; err = sock_error(sk); /* Always set at this point */ goto out_release; } sock->state = SS_CONNECTED; out_release: release_sock(sk); return err; } static int rose_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct sk_buff *skb; struct sock *newsk; DEFINE_WAIT(wait); struct sock *sk; int err = 0; if ((sk = sock->sk) == NULL) return -EINVAL; lock_sock(sk); if (sk->sk_type != SOCK_SEQPACKET) { err = -EOPNOTSUPP; goto out_release; } if (sk->sk_state != TCP_LISTEN) { err = -EINVAL; goto out_release; } /* * The write queue this time is holding sockets ready to use * hooked into the SABM we saved */ for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); skb = skb_dequeue(&sk->sk_receive_queue); if (skb) break; if (arg->flags & O_NONBLOCK) { err = -EWOULDBLOCK; break; } if (!signal_pending(current)) { release_sock(sk); schedule(); lock_sock(sk); continue; } err = -ERESTARTSYS; break; } finish_wait(sk_sleep(sk), &wait); if (err) goto out_release; newsk = skb->sk; sock_graft(newsk, newsock); /* Now attach up the new socket */ skb->sk = NULL; kfree_skb(skb); sk_acceptq_removed(sk); out_release: release_sock(sk); return err; } static int rose_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct full_sockaddr_rose *srose = (struct full_sockaddr_rose *)uaddr; struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); int n; memset(srose, 0, sizeof(*srose)); if (peer != 0) { if (sk->sk_state != TCP_ESTABLISHED) return -ENOTCONN; srose->srose_family = AF_ROSE; srose->srose_addr = rose->dest_addr; srose->srose_call = rose->dest_call; srose->srose_ndigis = rose->dest_ndigis; for (n = 0; n < rose->dest_ndigis; n++) srose->srose_digis[n] = rose->dest_digis[n]; } else { srose->srose_family = AF_ROSE; srose->srose_addr = rose->source_addr; srose->srose_call = rose->source_call; srose->srose_ndigis = rose->source_ndigis; for (n = 0; n < rose->source_ndigis; n++) srose->srose_digis[n] = rose->source_digis[n]; } return sizeof(struct full_sockaddr_rose); } int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct rose_neigh *neigh, unsigned int lci) { struct sock *sk; struct sock *make; struct rose_sock *make_rose; struct rose_facilities_struct facilities; int n; skb->sk = NULL; /* Initially we don't know who it's for */ /* * skb->data points to the rose frame start */ memset(&facilities, 0x00, sizeof(struct rose_facilities_struct)); if (!rose_parse_facilities(skb->data + ROSE_CALL_REQ_FACILITIES_OFF, skb->len - ROSE_CALL_REQ_FACILITIES_OFF, &facilities)) { rose_transmit_clear_request(neigh, lci, ROSE_INVALID_FACILITY, 76); return 0; } sk = rose_find_listener(&facilities.source_addr, &facilities.source_call); /* * We can't accept the Call Request. 
*/ if (sk == NULL || sk_acceptq_is_full(sk) || (make = rose_make_new(sk)) == NULL) { rose_transmit_clear_request(neigh, lci, ROSE_NETWORK_CONGESTION, 120); return 0; } skb->sk = make; make->sk_state = TCP_ESTABLISHED; make_rose = rose_sk(make); make_rose->lci = lci; make_rose->dest_addr = facilities.dest_addr; make_rose->dest_call = facilities.dest_call; make_rose->dest_ndigis = facilities.dest_ndigis; for (n = 0 ; n < facilities.dest_ndigis ; n++) make_rose->dest_digis[n] = facilities.dest_digis[n]; make_rose->source_addr = facilities.source_addr; make_rose->source_call = facilities.source_call; make_rose->source_ndigis = facilities.source_ndigis; for (n = 0 ; n < facilities.source_ndigis ; n++) make_rose->source_digis[n] = facilities.source_digis[n]; make_rose->neighbour = neigh; make_rose->device = dev; /* Caller got a reference for us. */ netdev_tracker_alloc(make_rose->device, &make_rose->dev_tracker, GFP_ATOMIC); make_rose->facilities = facilities; make_rose->neighbour->use++; if (rose_sk(sk)->defer) { make_rose->state = ROSE_STATE_5; } else { rose_write_internal(make, ROSE_CALL_ACCEPTED); make_rose->state = ROSE_STATE_3; rose_start_idletimer(make); } make_rose->condition = 0x00; make_rose->vs = 0; make_rose->va = 0; make_rose->vr = 0; make_rose->vl = 0; sk_acceptq_added(sk); rose_insert_socket(make); skb_queue_head(&sk->sk_receive_queue, skb); rose_start_heartbeat(make); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); return 1; } static int rose_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); DECLARE_SOCKADDR(struct sockaddr_rose *, usrose, msg->msg_name); int err; struct full_sockaddr_rose srose; struct sk_buff *skb; unsigned char *asmptr; int n, size, qbit = 0; if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT)) return -EINVAL; if (sock_flag(sk, SOCK_ZAPPED)) return -EADDRNOTAVAIL; if (sk->sk_shutdown & SEND_SHUTDOWN) { send_sig(SIGPIPE, current, 0); return -EPIPE; } if (rose->neighbour == NULL || rose->device == NULL) return -ENETUNREACH; if (usrose != NULL) { if (msg->msg_namelen != sizeof(struct sockaddr_rose) && msg->msg_namelen != sizeof(struct full_sockaddr_rose)) return -EINVAL; memset(&srose, 0, sizeof(struct full_sockaddr_rose)); memcpy(&srose, usrose, msg->msg_namelen); if (rosecmp(&rose->dest_addr, &srose.srose_addr) != 0 || ax25cmp(&rose->dest_call, &srose.srose_call) != 0) return -EISCONN; if (srose.srose_ndigis != rose->dest_ndigis) return -EISCONN; if (srose.srose_ndigis == rose->dest_ndigis) { for (n = 0 ; n < srose.srose_ndigis ; n++) if (ax25cmp(&rose->dest_digis[n], &srose.srose_digis[n])) return -EISCONN; } if (srose.srose_family != AF_ROSE) return -EINVAL; } else { if (sk->sk_state != TCP_ESTABLISHED) return -ENOTCONN; srose.srose_family = AF_ROSE; srose.srose_addr = rose->dest_addr; srose.srose_call = rose->dest_call; srose.srose_ndigis = rose->dest_ndigis; for (n = 0 ; n < rose->dest_ndigis ; n++) srose.srose_digis[n] = rose->dest_digis[n]; } /* Build a packet */ /* Sanity check the packet size */ if (len > 65535) return -EMSGSIZE; size = len + AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN; if ((skb = sock_alloc_send_skb(sk, size, msg->msg_flags & MSG_DONTWAIT, &err)) == NULL) return err; skb_reserve(skb, AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN); /* * Put the data on the end */ skb_reset_transport_header(skb); skb_put(skb, len); err = memcpy_from_msg(skb_transport_header(skb), msg, len); if (err) { kfree_skb(skb); return err; } /* * If 
the Q BIT Include socket option is in force, the first * byte of the user data is the logical value of the Q Bit. */ if (rose->qbitincl) { qbit = skb->data[0]; skb_pull(skb, 1); } /* * Push down the ROSE header */ asmptr = skb_push(skb, ROSE_MIN_LEN); /* Build a ROSE Network header */ asmptr[0] = ((rose->lci >> 8) & 0x0F) | ROSE_GFI; asmptr[1] = (rose->lci >> 0) & 0xFF; asmptr[2] = ROSE_DATA; if (qbit) asmptr[0] |= ROSE_Q_BIT; if (sk->sk_state != TCP_ESTABLISHED) { kfree_skb(skb); return -ENOTCONN; } #ifdef M_BIT #define ROSE_PACLEN (256-ROSE_MIN_LEN) if (skb->len - ROSE_MIN_LEN > ROSE_PACLEN) { unsigned char header[ROSE_MIN_LEN]; struct sk_buff *skbn; int frontlen; int lg; /* Save a copy of the Header */ skb_copy_from_linear_data(skb, header, ROSE_MIN_LEN); skb_pull(skb, ROSE_MIN_LEN); frontlen = skb_headroom(skb); while (skb->len > 0) { if ((skbn = sock_alloc_send_skb(sk, frontlen + ROSE_PACLEN, 0, &err)) == NULL) { kfree_skb(skb); return err; } skbn->sk = sk; skbn->free = 1; skbn->arp = 1; skb_reserve(skbn, frontlen); lg = (ROSE_PACLEN > skb->len) ? skb->len : ROSE_PACLEN; /* Copy the user data */ skb_copy_from_linear_data(skb, skb_put(skbn, lg), lg); skb_pull(skb, lg); /* Duplicate the Header */ skb_push(skbn, ROSE_MIN_LEN); skb_copy_to_linear_data(skbn, header, ROSE_MIN_LEN); if (skb->len > 0) skbn->data[2] |= M_BIT; skb_queue_tail(&sk->sk_write_queue, skbn); /* Throw it on the queue */ } skb->free = 1; kfree_skb(skb); } else { skb_queue_tail(&sk->sk_write_queue, skb); /* Throw it on the queue */ } #else skb_queue_tail(&sk->sk_write_queue, skb); /* Shove it onto the queue */ #endif rose_kick(sk); return len; } static int rose_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); size_t copied; unsigned char *asmptr; struct sk_buff *skb; int n, er, qbit; /* * This works for seqpacket too. The receiver has ordered the queue for * us! 
We do one quick check first though */ if (sk->sk_state != TCP_ESTABLISHED) return -ENOTCONN; /* Now we can treat all alike */ skb = skb_recv_datagram(sk, flags, &er); if (!skb) return er; qbit = (skb->data[0] & ROSE_Q_BIT) == ROSE_Q_BIT; skb_pull(skb, ROSE_MIN_LEN); if (rose->qbitincl) { asmptr = skb_push(skb, 1); *asmptr = qbit; } skb_reset_transport_header(skb); copied = skb->len; if (copied > size) { copied = size; msg->msg_flags |= MSG_TRUNC; } skb_copy_datagram_msg(skb, 0, msg, copied); if (msg->msg_name) { struct sockaddr_rose *srose; DECLARE_SOCKADDR(struct full_sockaddr_rose *, full_srose, msg->msg_name); memset(msg->msg_name, 0, sizeof(struct full_sockaddr_rose)); srose = msg->msg_name; srose->srose_family = AF_ROSE; srose->srose_addr = rose->dest_addr; srose->srose_call = rose->dest_call; srose->srose_ndigis = rose->dest_ndigis; for (n = 0 ; n < rose->dest_ndigis ; n++) full_srose->srose_digis[n] = rose->dest_digis[n]; msg->msg_namelen = sizeof(struct full_sockaddr_rose); } skb_free_datagram(sk, skb); return copied; } static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); void __user *argp = (void __user *)arg; switch (cmd) { case TIOCOUTQ: { long amount; amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); if (amount < 0) amount = 0; return put_user(amount, (unsigned int __user *) argp); } case TIOCINQ: { struct sk_buff *skb; long amount = 0L; spin_lock_irq(&sk->sk_receive_queue.lock); if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) amount = skb->len; spin_unlock_irq(&sk->sk_receive_queue.lock); return put_user(amount, (unsigned int __user *) argp); } case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFMETRIC: case SIOCSIFMETRIC: return -EINVAL; case SIOCADDRT: case SIOCDELRT: case SIOCRSCLRRT: if (!capable(CAP_NET_ADMIN)) return -EPERM; return rose_rt_ioctl(cmd, argp); case SIOCRSGCAUSE: { struct rose_cause_struct rose_cause; rose_cause.cause = rose->cause; rose_cause.diagnostic = rose->diagnostic; return copy_to_user(argp, &rose_cause, sizeof(struct rose_cause_struct)) ? -EFAULT : 0; } case SIOCRSSCAUSE: { struct rose_cause_struct rose_cause; if (copy_from_user(&rose_cause, argp, sizeof(struct rose_cause_struct))) return -EFAULT; rose->cause = rose_cause.cause; rose->diagnostic = rose_cause.diagnostic; return 0; } case SIOCRSSL2CALL: if (!capable(CAP_NET_ADMIN)) return -EPERM; if (ax25cmp(&rose_callsign, &null_ax25_address) != 0) ax25_listen_release(&rose_callsign, NULL); if (copy_from_user(&rose_callsign, argp, sizeof(ax25_address))) return -EFAULT; if (ax25cmp(&rose_callsign, &null_ax25_address) != 0) return ax25_listen_register(&rose_callsign, NULL); return 0; case SIOCRSGL2CALL: return copy_to_user(argp, &rose_callsign, sizeof(ax25_address)) ? 
-EFAULT : 0; case SIOCRSACCEPT: if (rose->state == ROSE_STATE_5) { rose_write_internal(sk, ROSE_CALL_ACCEPTED); rose_start_idletimer(sk); rose->condition = 0x00; rose->vs = 0; rose->va = 0; rose->vr = 0; rose->vl = 0; rose->state = ROSE_STATE_3; } return 0; default: return -ENOIOCTLCMD; } return 0; } #ifdef CONFIG_PROC_FS static void *rose_info_start(struct seq_file *seq, loff_t *pos) __acquires(rose_list_lock) { spin_lock_bh(&rose_list_lock); return seq_hlist_start_head(&rose_list, *pos); } static void *rose_info_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_hlist_next(v, &rose_list, pos); } static void rose_info_stop(struct seq_file *seq, void *v) __releases(rose_list_lock) { spin_unlock_bh(&rose_list_lock); } static int rose_info_show(struct seq_file *seq, void *v) { char buf[11], rsbuf[11]; if (v == SEQ_START_TOKEN) seq_puts(seq, "dest_addr dest_call src_addr src_call dev lci neigh st vs vr va t t1 t2 t3 hb idle Snd-Q Rcv-Q inode\n"); else { struct sock *s = sk_entry(v); struct rose_sock *rose = rose_sk(s); const char *devname, *callsign; const struct net_device *dev = rose->device; if (!dev) devname = "???"; else devname = dev->name; seq_printf(seq, "%-10s %-9s ", rose2asc(rsbuf, &rose->dest_addr), ax2asc(buf, &rose->dest_call)); if (ax25cmp(&rose->source_call, &null_ax25_address) == 0) callsign = "??????-?"; else callsign = ax2asc(buf, &rose->source_call); seq_printf(seq, "%-10s %-9s %-5s %3.3X %05d %d %d %d %d %3lu %3lu %3lu %3lu %3lu %3lu/%03lu %5d %5d %ld\n", rose2asc(rsbuf, &rose->source_addr), callsign, devname, rose->lci & 0x0FFF, (rose->neighbour) ? rose->neighbour->number : 0, rose->state, rose->vs, rose->vr, rose->va, ax25_display_timer(&rose->timer) / HZ, rose->t1 / HZ, rose->t2 / HZ, rose->t3 / HZ, rose->hb / HZ, ax25_display_timer(&rose->idletimer) / (60 * HZ), rose->idle / (60 * HZ), sk_wmem_alloc_get(s), sk_rmem_alloc_get(s), s->sk_socket ? 
SOCK_INODE(s->sk_socket)->i_ino : 0L); } return 0; } static const struct seq_operations rose_info_seqops = { .start = rose_info_start, .next = rose_info_next, .stop = rose_info_stop, .show = rose_info_show, }; #endif /* CONFIG_PROC_FS */ static const struct net_proto_family rose_family_ops = { .family = PF_ROSE, .create = rose_create, .owner = THIS_MODULE, }; static const struct proto_ops rose_proto_ops = { .family = PF_ROSE, .owner = THIS_MODULE, .release = rose_release, .bind = rose_bind, .connect = rose_connect, .socketpair = sock_no_socketpair, .accept = rose_accept, .getname = rose_getname, .poll = datagram_poll, .ioctl = rose_ioctl, .gettstamp = sock_gettstamp, .listen = rose_listen, .shutdown = sock_no_shutdown, .setsockopt = rose_setsockopt, .getsockopt = rose_getsockopt, .sendmsg = rose_sendmsg, .recvmsg = rose_recvmsg, .mmap = sock_no_mmap, }; static struct notifier_block rose_dev_notifier = { .notifier_call = rose_device_event, }; static struct net_device **dev_rose; static struct ax25_protocol rose_pid = { .pid = AX25_P_ROSE, .func = rose_route_frame }; static struct ax25_linkfail rose_linkfail_notifier = { .func = rose_link_failed }; static int __init rose_proto_init(void) { int i; int rc; if (rose_ndevs > 0x7FFFFFFF/sizeof(struct net_device *)) { printk(KERN_ERR "ROSE: rose_proto_init - rose_ndevs parameter too large\n"); rc = -EINVAL; goto out; } rc = proto_register(&rose_proto, 0); if (rc != 0) goto out; rose_callsign = null_ax25_address; dev_rose = kcalloc(rose_ndevs, sizeof(struct net_device *), GFP_KERNEL); if (dev_rose == NULL) { printk(KERN_ERR "ROSE: rose_proto_init - unable to allocate device structure\n"); rc = -ENOMEM; goto out_proto_unregister; } for (i = 0; i < rose_ndevs; i++) { struct net_device *dev; char name[IFNAMSIZ]; sprintf(name, "rose%d", i); dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, rose_setup); if (!dev) { printk(KERN_ERR "ROSE: rose_proto_init - unable to allocate memory\n"); rc = -ENOMEM; goto fail; } rc = register_netdev(dev); if (rc) { printk(KERN_ERR "ROSE: netdevice registration failed\n"); free_netdev(dev); goto fail; } rose_set_lockdep_key(dev); dev_rose[i] = dev; } sock_register(&rose_family_ops); register_netdevice_notifier(&rose_dev_notifier); ax25_register_pid(&rose_pid); ax25_linkfail_register(&rose_linkfail_notifier); #ifdef CONFIG_SYSCTL rose_register_sysctl(); #endif rose_loopback_init(); rose_add_loopback_neigh(); proc_create_seq("rose", 0444, init_net.proc_net, &rose_info_seqops); proc_create_seq("rose_neigh", 0444, init_net.proc_net, &rose_neigh_seqops); proc_create_seq("rose_nodes", 0444, init_net.proc_net, &rose_node_seqops); proc_create_seq("rose_routes", 0444, init_net.proc_net, &rose_route_seqops); out: return rc; fail: while (--i >= 0) { unregister_netdev(dev_rose[i]); free_netdev(dev_rose[i]); } kfree(dev_rose); out_proto_unregister: proto_unregister(&rose_proto); goto out; } module_init(rose_proto_init); module_param(rose_ndevs, int, 0); MODULE_PARM_DESC(rose_ndevs, "number of ROSE devices"); MODULE_AUTHOR("Jonathan Naylor G4KLX <g4klx@g4klx.demon.co.uk>"); MODULE_DESCRIPTION("The amateur radio ROSE network layer protocol"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_ROSE); static void __exit rose_exit(void) { int i; remove_proc_entry("rose", init_net.proc_net); remove_proc_entry("rose_neigh", init_net.proc_net); remove_proc_entry("rose_nodes", init_net.proc_net); remove_proc_entry("rose_routes", init_net.proc_net); rose_loopback_clear(); rose_rt_free(); ax25_protocol_release(AX25_P_ROSE); 
ax25_linkfail_release(&rose_linkfail_notifier); if (ax25cmp(&rose_callsign, &null_ax25_address) != 0) ax25_listen_release(&rose_callsign, NULL); #ifdef CONFIG_SYSCTL rose_unregister_sysctl(); #endif unregister_netdevice_notifier(&rose_dev_notifier); sock_unregister(PF_ROSE); for (i = 0; i < rose_ndevs; i++) { struct net_device *dev = dev_rose[i]; if (dev) { unregister_netdev(dev); free_netdev(dev); } } kfree(dev_rose); proto_unregister(&rose_proto); } module_exit(rose_exit);
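Illustrative userspace sketch (not part of the module above): the entry points registered by rose_proto_init() are reached through an AF_ROSE, SOCK_SEQPACKET socket, and rose_bind() expects the sockaddr fields shown here. The header locations (<netax25/ax25.h>, <netrose/rose.h> from libax25) and the zeroed addresses are assumptions; a real program would fill in its local ROSE address and AX.25 callsign.

/* rose_bind_example.c - minimal sketch only. */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>
#include <netax25/ax25.h>	/* ax25_address (assumed userspace header) */
#include <netrose/rose.h>	/* struct sockaddr_rose (assumed userspace header) */

int main(void)
{
	int fd = socket(AF_ROSE, SOCK_SEQPACKET, 0);	/* matches rose_create() */
	struct sockaddr_rose addr;

	if (fd < 0) {
		perror("socket(AF_ROSE)");
		return 1;
	}

	memset(&addr, 0, sizeof(addr));
	addr.srose_family = AF_ROSE;
	addr.srose_ndigis = 0;
	/* srose_addr and srose_call would normally carry the local ROSE
	 * address and callsign; rose_bind() validates them against a
	 * registered ROSE device and the AX.25 UID map. */

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		perror("bind");

	close(fd);
	return 0;
}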
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Filesystem
access notification for Linux * * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> */ #ifndef __LINUX_FSNOTIFY_BACKEND_H #define __LINUX_FSNOTIFY_BACKEND_H #ifdef __KERNEL__ #include <linux/idr.h> /* inotify uses this */ #include <linux/fs.h> /* struct inode */ #include <linux/list.h> #include <linux/path.h> /* struct path */ #include <linux/spinlock.h> #include <linux/types.h> #include <linux/atomic.h> #include <linux/user_namespace.h> #include <linux/refcount.h> #include <linux/mempool.h> #include <linux/sched/mm.h> /* * IN_* from inotify.h lines up EXACTLY with FS_*; this is so we can easily * convert between them. dnotify only needs conversion at watch creation * so no perf loss there. fanotify isn't defined yet, so it can use the * holes if it needs more events. */ #define FS_ACCESS 0x00000001 /* File was accessed */ #define FS_MODIFY 0x00000002 /* File was modified */ #define FS_ATTRIB 0x00000004 /* Metadata changed */ #define FS_CLOSE_WRITE 0x00000008 /* Writable file was closed */ #define FS_CLOSE_NOWRITE 0x00000010 /* Unwritable file closed */ #define FS_OPEN 0x00000020 /* File was opened */ #define FS_MOVED_FROM 0x00000040 /* File was moved from X */ #define FS_MOVED_TO 0x00000080 /* File was moved to Y */ #define FS_CREATE 0x00000100 /* Subfile was created */ #define FS_DELETE 0x00000200 /* Subfile was deleted */ #define FS_DELETE_SELF 0x00000400 /* Self was deleted */ #define FS_MOVE_SELF 0x00000800 /* Self was moved */ #define FS_OPEN_EXEC 0x00001000 /* File was opened for exec */ #define FS_UNMOUNT 0x00002000 /* inode on umount fs */ #define FS_Q_OVERFLOW 0x00004000 /* Event queued overflowed */ #define FS_ERROR 0x00008000 /* Filesystem Error (fanotify) */ /* * FS_IN_IGNORED overloads FS_ERROR. It is only used internally by inotify * which does not support FS_ERROR. */ #define FS_IN_IGNORED 0x00008000 /* last inotify event here */ #define FS_OPEN_PERM 0x00010000 /* open event in a permission hook */ #define FS_ACCESS_PERM 0x00020000 /* access event in a permission hook */ #define FS_OPEN_EXEC_PERM 0x00040000 /* open/exec event in a permission hook */ /* * Set on inode mark that cares about things that happen to its children. * Always set for dnotify and inotify. * Set on inode/sb/mount marks that care about parent/name info. */ #define FS_EVENT_ON_CHILD 0x08000000 #define FS_RENAME 0x10000000 /* File was renamed */ #define FS_DN_MULTISHOT 0x20000000 /* dnotify multishot */ #define FS_ISDIR 0x40000000 /* event occurred against dir */ #define FS_MOVE (FS_MOVED_FROM | FS_MOVED_TO) /* * Directory entry modification events - reported only to directory * where entry is modified and not to a watching parent. * The watching parent may get an FS_ATTRIB|FS_EVENT_ON_CHILD event * when a directory entry inside a child subdir changes. */ #define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME) #define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \ FS_OPEN_EXEC_PERM) /* * This is a list of all events that may get sent to a parent that is watching * with flag FS_EVENT_ON_CHILD based on fs event on a child of that directory. */ #define FS_EVENTS_POSS_ON_CHILD (ALL_FSNOTIFY_PERM_EVENTS | \ FS_ACCESS | FS_MODIFY | FS_ATTRIB | \ FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | \ FS_OPEN | FS_OPEN_EXEC) /* * This is a list of all events that may get sent with the parent inode as the * @to_tell argument of fsnotify(). * It may include events that can be sent to an inode/sb/mount mark, but cannot * be sent to a parent watching children.
*/ #define FS_EVENTS_POSS_TO_PARENT (FS_EVENTS_POSS_ON_CHILD) /* Events that can be reported to backends */ #define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \ FS_EVENTS_POSS_ON_CHILD | \ FS_DELETE_SELF | FS_MOVE_SELF | \ FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \ FS_ERROR) /* Extra flags that may be reported with event or control handling of events */ #define ALL_FSNOTIFY_FLAGS (FS_ISDIR | FS_EVENT_ON_CHILD | FS_DN_MULTISHOT) #define ALL_FSNOTIFY_BITS (ALL_FSNOTIFY_EVENTS | ALL_FSNOTIFY_FLAGS) struct fsnotify_group; struct fsnotify_event; struct fsnotify_mark; struct fsnotify_event_private_data; struct fsnotify_fname; struct fsnotify_iter_info; struct mem_cgroup; /* * Each group must define these ops. The fsnotify infrastructure will call * these operations for each relevant group. * * handle_event - main call for a group to handle an fs event * @group: group to notify * @mask: event type and flags * @data: object that event happened on * @data_type: type of object for fanotify_data_XXX() accessors * @dir: optional directory associated with event - * if @file_name is not NULL, this is the directory that * @file_name is relative to * @file_name: optional file name associated with event * @cookie: inotify rename cookie * @iter_info: array of marks from this group that are interested in the event * * handle_inode_event - simple variant of handle_event() for groups that only * have inode marks and don't have ignore mask * @mark: mark to notify * @mask: event type and flags * @inode: inode that event happened on * @dir: optional directory associated with event - * if @file_name is not NULL, this is the directory that * @file_name is relative to. * Either @inode or @dir must be non-NULL. * @file_name: optional file name associated with event * @cookie: inotify rename cookie * * free_group_priv - called when a group refcnt hits 0 to clean up the private union * freeing_mark - called when a mark is being destroyed for some reason. The group * MUST be holding a reference on each mark and that reference must be * dropped in this function. inotify uses this function to send * userspace messages that marks have been removed. */ struct fsnotify_ops { int (*handle_event)(struct fsnotify_group *group, u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info); int (*handle_inode_event)(struct fsnotify_mark *mark, u32 mask, struct inode *inode, struct inode *dir, const struct qstr *file_name, u32 cookie); void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); void (*free_event)(struct fsnotify_group *group, struct fsnotify_event *event); /* called on final put+free to free memory */ void (*free_mark)(struct fsnotify_mark *mark); }; /* * all of the information about the original object we now want to send to * a group. If you want to carry more info from the accessing task to the * listener, this structure is where you need to add fields. */ struct fsnotify_event { struct list_head list; }; /* * fsnotify group priorities. * Events are sent in order from highest priority to lowest priority. */ enum fsnotify_group_prio { FSNOTIFY_PRIO_NORMAL = 0, /* normal notifiers, no permissions */ FSNOTIFY_PRIO_CONTENT, /* fanotify permission events */ FSNOTIFY_PRIO_PRE_CONTENT, /* fanotify pre-content events */ __FSNOTIFY_PRIO_NUM }; /* * A group is a "thing" that wants to receive notification about filesystem * events.
The mask holds the subset of event types this group cares about. * refcnt on a group is up to the implementor and at any moment if it goes 0 * everything will be cleaned up. */ struct fsnotify_group { const struct fsnotify_ops *ops; /* how this group handles things */ /* * How the refcnt is used is up to each group. When the refcnt hits 0 * fsnotify will clean up all of the resources associated with this group. * As an example, the dnotify group will always have a refcnt=1 and that * will never change. Inotify, on the other hand, has a group per * inotify_init() and the refcnt will hit 0 only when that fd has been * closed. */ refcount_t refcnt; /* things with interest in this group */ /* needed to send notification to userspace */ spinlock_t notification_lock; /* protect the notification_list */ struct list_head notification_list; /* list of event_holder this group needs to send to userspace */ wait_queue_head_t notification_waitq; /* read() on the notification file blocks on this waitq */ unsigned int q_len; /* events on the queue */ unsigned int max_events; /* maximum events allowed on the list */ enum fsnotify_group_prio priority; /* priority for sending events */ bool shutdown; /* group is being shut down, don't queue more events */ #define FSNOTIFY_GROUP_USER 0x01 /* user allocated group */ #define FSNOTIFY_GROUP_DUPS 0x02 /* allow multiple marks per object */ int flags; unsigned int owner_flags; /* stored flags of mark_mutex owner */ /* stores all fastpath marks assoc with this group so they can be cleaned on unregister */ struct mutex mark_mutex; /* protect marks_list */ atomic_t user_waits; /* Number of tasks waiting for user * response */ struct list_head marks_list; /* all inode marks for this group */ struct fasync_struct *fsn_fa; /* async notification */ struct fsnotify_event *overflow_event; /* Event we queue when the * notification list is too * full */ struct mem_cgroup *memcg; /* memcg to charge allocations */ /* groups can define private fields here or use the void *private */ union { void *private; #ifdef CONFIG_INOTIFY_USER struct inotify_group_private_data { spinlock_t idr_lock; struct idr idr; struct ucounts *ucounts; } inotify_data; #endif #ifdef CONFIG_FANOTIFY struct fanotify_group_private_data { /* Hash table of events for merge */ struct hlist_head *merge_hash; /* allows a group to block waiting for a userspace response */ struct list_head access_list; wait_queue_head_t access_waitq; int flags; /* flags from fanotify_init() */ int f_flags; /* event_f_flags from fanotify_init() */ struct ucounts *ucounts; mempool_t error_events_pool; } fanotify_data; #endif /* CONFIG_FANOTIFY */ }; }; /* * These helpers are used to prevent deadlock when reclaiming inodes with * evictable marks of the same group that is allocating a new mark. 
*/ static inline void fsnotify_group_lock(struct fsnotify_group *group) { mutex_lock(&group->mark_mutex); group->owner_flags = memalloc_nofs_save(); } static inline void fsnotify_group_unlock(struct fsnotify_group *group) { memalloc_nofs_restore(group->owner_flags); mutex_unlock(&group->mark_mutex); } static inline void fsnotify_group_assert_locked(struct fsnotify_group *group) { WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex)); WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS)); } /* When calling fsnotify tell it if the data is a path or inode */ enum fsnotify_data_type { FSNOTIFY_EVENT_NONE, FSNOTIFY_EVENT_PATH, FSNOTIFY_EVENT_INODE, FSNOTIFY_EVENT_DENTRY, FSNOTIFY_EVENT_ERROR, }; struct fs_error_report { int error; struct inode *inode; struct super_block *sb; }; static inline struct inode *fsnotify_data_inode(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_INODE: return (struct inode *)data; case FSNOTIFY_EVENT_DENTRY: return d_inode(data); case FSNOTIFY_EVENT_PATH: return d_inode(((const struct path *)data)->dentry); case FSNOTIFY_EVENT_ERROR: return ((struct fs_error_report *)data)->inode; default: return NULL; } } static inline struct dentry *fsnotify_data_dentry(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_DENTRY: /* Non const is needed for dget() */ return (struct dentry *)data; case FSNOTIFY_EVENT_PATH: return ((const struct path *)data)->dentry; default: return NULL; } } static inline const struct path *fsnotify_data_path(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_PATH: return data; default: return NULL; } } static inline struct super_block *fsnotify_data_sb(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_INODE: return ((struct inode *)data)->i_sb; case FSNOTIFY_EVENT_DENTRY: return ((struct dentry *)data)->d_sb; case FSNOTIFY_EVENT_PATH: return ((const struct path *)data)->dentry->d_sb; case FSNOTIFY_EVENT_ERROR: return ((struct fs_error_report *) data)->sb; default: return NULL; } } static inline struct fs_error_report *fsnotify_data_error_report( const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_ERROR: return (struct fs_error_report *) data; default: return NULL; } } /* * Index to merged marks iterator array that correlates to a type of watch. * The type of watched object can be deduced from the iterator type, but not * the other way around, because an event can match different watched objects * of the same object type. * For example, both parent and child are watching an object of type inode. 
*/ enum fsnotify_iter_type { FSNOTIFY_ITER_TYPE_INODE, FSNOTIFY_ITER_TYPE_VFSMOUNT, FSNOTIFY_ITER_TYPE_SB, FSNOTIFY_ITER_TYPE_PARENT, FSNOTIFY_ITER_TYPE_INODE2, FSNOTIFY_ITER_TYPE_COUNT }; /* The type of object that a mark is attached to */ enum fsnotify_obj_type { FSNOTIFY_OBJ_TYPE_ANY = -1, FSNOTIFY_OBJ_TYPE_INODE, FSNOTIFY_OBJ_TYPE_VFSMOUNT, FSNOTIFY_OBJ_TYPE_SB, FSNOTIFY_OBJ_TYPE_COUNT, FSNOTIFY_OBJ_TYPE_DETACHED = FSNOTIFY_OBJ_TYPE_COUNT }; static inline bool fsnotify_valid_obj_type(unsigned int obj_type) { return (obj_type < FSNOTIFY_OBJ_TYPE_COUNT); } struct fsnotify_iter_info { struct fsnotify_mark *marks[FSNOTIFY_ITER_TYPE_COUNT]; struct fsnotify_group *current_group; unsigned int report_mask; int srcu_idx; }; static inline bool fsnotify_iter_should_report_type( struct fsnotify_iter_info *iter_info, int iter_type) { return (iter_info->report_mask & (1U << iter_type)); } static inline void fsnotify_iter_set_report_type( struct fsnotify_iter_info *iter_info, int iter_type) { iter_info->report_mask |= (1U << iter_type); } static inline struct fsnotify_mark *fsnotify_iter_mark( struct fsnotify_iter_info *iter_info, int iter_type) { if (fsnotify_iter_should_report_type(iter_info, iter_type)) return iter_info->marks[iter_type]; return NULL; } static inline int fsnotify_iter_step(struct fsnotify_iter_info *iter, int type, struct fsnotify_mark **markp) { while (type < FSNOTIFY_ITER_TYPE_COUNT) { *markp = fsnotify_iter_mark(iter, type); if (*markp) break; type++; } return type; } #define FSNOTIFY_ITER_FUNCS(name, NAME) \ static inline struct fsnotify_mark *fsnotify_iter_##name##_mark( \ struct fsnotify_iter_info *iter_info) \ { \ return fsnotify_iter_mark(iter_info, FSNOTIFY_ITER_TYPE_##NAME); \ } FSNOTIFY_ITER_FUNCS(inode, INODE) FSNOTIFY_ITER_FUNCS(parent, PARENT) FSNOTIFY_ITER_FUNCS(vfsmount, VFSMOUNT) FSNOTIFY_ITER_FUNCS(sb, SB) #define fsnotify_foreach_iter_type(type) \ for (type = 0; type < FSNOTIFY_ITER_TYPE_COUNT; type++) #define fsnotify_foreach_iter_mark_type(iter, mark, type) \ for (type = 0; \ type = fsnotify_iter_step(iter, type, &mark), \ type < FSNOTIFY_ITER_TYPE_COUNT; \ type++) /* * Inode/vfsmount/sb point to this structure which tracks all marks attached to * the inode/vfsmount/sb. The reference to inode/vfsmount/sb is held by this * structure. We destroy this structure when there are no more marks attached * to it. The structure is protected by fsnotify_mark_srcu. */ struct fsnotify_mark_connector { spinlock_t lock; unsigned char type; /* Type of object [lock] */ unsigned char prio; /* Highest priority group */ #define FSNOTIFY_CONN_FLAG_IS_WATCHED 0x01 #define FSNOTIFY_CONN_FLAG_HAS_IREF 0x02 unsigned short flags; /* flags [lock] */ union { /* Object pointer [lock] */ void *obj; /* Used listing heads to free after srcu period expires */ struct fsnotify_mark_connector *destroy_next; }; struct hlist_head list; }; /* * Container for per-sb fsnotify state (sb marks and more). * Attached lazily on first marked object on the sb and freed when killing sb. */ struct fsnotify_sb_info { struct fsnotify_mark_connector __rcu *sb_marks; /* * Number of inode/mount/sb objects that are being watched in this sb. * Note that inodes objects are currently double-accounted. * * The value in watched_objects[prio] is the number of objects that are * watched by groups of priority >= prio, so watched_objects[0] is the * total number of watched objects in this sb. 
*/ atomic_long_t watched_objects[__FSNOTIFY_PRIO_NUM]; }; static inline struct fsnotify_sb_info *fsnotify_sb_info(struct super_block *sb) { #ifdef CONFIG_FSNOTIFY return READ_ONCE(sb->s_fsnotify_info); #else return NULL; #endif } static inline atomic_long_t *fsnotify_sb_watched_objects(struct super_block *sb) { return &fsnotify_sb_info(sb)->watched_objects[0]; } /* * A mark is simply an object attached to an in core inode which allows an * fsnotify listener to indicate they are either no longer interested in events * of a type matching mask or only interested in those events. * * These are flushed when an inode is evicted from core and may be flushed * when the inode is modified (as seen by fsnotify_access). Some fsnotify * users (such as dnotify) will flush these when the open fd is closed and not * at inode eviction or modification. * * Text in brackets is showing the lock(s) protecting modifications of a * particular entry. obj_lock means either inode->i_lock or * mnt->mnt_root->d_lock depending on the mark type. */ struct fsnotify_mark { /* Mask this mark is for [mark->lock, group->mark_mutex] */ __u32 mask; /* We hold one for presence in g_list. Also one ref for each 'thing' * in kernel that found and may be using this mark. */ refcount_t refcnt; /* Group this mark is for. Set on mark creation, stable until last ref * is dropped */ struct fsnotify_group *group; /* List of marks by group->marks_list. Also reused for queueing * mark into destroy_list when it's waiting for the end of SRCU period * before it can be freed. [group->mark_mutex] */ struct list_head g_list; /* Protects inode / mnt pointers, flags, masks */ spinlock_t lock; /* List of marks for inode / vfsmount [connector->lock, mark ref] */ struct hlist_node obj_list; /* Head of list of marks for an object [mark ref] */ struct fsnotify_mark_connector *connector; /* Events types and flags to ignore [mark->lock, group->mark_mutex] */ __u32 ignore_mask; /* General fsnotify mark flags */ #define FSNOTIFY_MARK_FLAG_ALIVE 0x0001 #define FSNOTIFY_MARK_FLAG_ATTACHED 0x0002 /* inotify mark flags */ #define FSNOTIFY_MARK_FLAG_EXCL_UNLINK 0x0010 #define FSNOTIFY_MARK_FLAG_IN_ONESHOT 0x0020 /* fanotify mark flags */ #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x0100 #define FSNOTIFY_MARK_FLAG_NO_IREF 0x0200 #define FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS 0x0400 #define FSNOTIFY_MARK_FLAG_HAS_FSID 0x0800 #define FSNOTIFY_MARK_FLAG_WEAK_FSID 0x1000 unsigned int flags; /* flags [mark->lock] */ }; #ifdef CONFIG_FSNOTIFY /* called from the vfs helpers */ /* main fsnotify call to send events */ extern int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, struct inode *inode, u32 cookie); extern int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type); extern void __fsnotify_inode_delete(struct inode *inode); extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt); extern void fsnotify_sb_delete(struct super_block *sb); extern void fsnotify_sb_free(struct super_block *sb); extern u32 fsnotify_get_cookie(void); static inline __u32 fsnotify_parent_needed_mask(__u32 mask) { /* FS_EVENT_ON_CHILD is set on marks that want parent/name info */ if (!(mask & FS_EVENT_ON_CHILD)) return 0; /* * This object might be watched by a mark that cares about parent/name * info, does it care about the specific set of events that can be * reported with parent/name info? 
*/ return mask & FS_EVENTS_POSS_TO_PARENT; } static inline int fsnotify_inode_watches_children(struct inode *inode) { __u32 parent_mask = READ_ONCE(inode->i_fsnotify_mask); /* FS_EVENT_ON_CHILD is set if the inode may care */ if (!(parent_mask & FS_EVENT_ON_CHILD)) return 0; /* this inode might care about child events, does it care about the * specific set of events that can happen on a child? */ return parent_mask & FS_EVENTS_POSS_ON_CHILD; } /* * Update the dentry with a flag indicating the interest of its parent to receive * filesystem events when those events happen to this dentry->d_inode. */ static inline void fsnotify_update_flags(struct dentry *dentry) { assert_spin_locked(&dentry->d_lock); /* * Serialisation of setting PARENT_WATCHED on the dentries is provided * by d_lock. If inotify_inode_watched changes after we have taken * d_lock, the following fsnotify_set_children_dentry_flags call will * find our entry, so it will spin until we complete here, and update * us with the new state. */ if (fsnotify_inode_watches_children(dentry->d_parent->d_inode)) dentry->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; else dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; } /* called from fsnotify listeners, such as fanotify or dnotify */ /* create a new group */ extern struct fsnotify_group *fsnotify_alloc_group( const struct fsnotify_ops *ops, int flags); /* get reference to a group */ extern void fsnotify_get_group(struct fsnotify_group *group); /* drop reference on a group from fsnotify_alloc_group */ extern void fsnotify_put_group(struct fsnotify_group *group); /* group destruction begins, stop queuing new events */ extern void fsnotify_group_stop_queueing(struct fsnotify_group *group); /* destroy group */ extern void fsnotify_destroy_group(struct fsnotify_group *group); /* fasync handler function */ extern int fsnotify_fasync(int fd, struct file *file, int on); /* Free event from memory */ extern void fsnotify_destroy_event(struct fsnotify_group *group, struct fsnotify_event *event); /* attach the event to the group notification queue */ extern int fsnotify_insert_event(struct fsnotify_group *group, struct fsnotify_event *event, int (*merge)(struct fsnotify_group *, struct fsnotify_event *), void (*insert)(struct fsnotify_group *, struct fsnotify_event *)); static inline int fsnotify_add_event(struct fsnotify_group *group, struct fsnotify_event *event, int (*merge)(struct fsnotify_group *, struct fsnotify_event *)) { return fsnotify_insert_event(group, event, merge, NULL); } /* Queue overflow event to a notification group */ static inline void fsnotify_queue_overflow(struct fsnotify_group *group) { fsnotify_add_event(group, group->overflow_event, NULL); } static inline bool fsnotify_is_overflow_event(u32 mask) { return mask & FS_Q_OVERFLOW; } static inline bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) { assert_spin_locked(&group->notification_lock); return list_empty(&group->notification_list); } /* return, but do not dequeue the first event on the notification queue */ extern struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group); /* return AND dequeue the first event on the notification queue */ extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group); /* Remove event queued in the notification list */ extern void fsnotify_remove_queued_event(struct fsnotify_group *group, struct fsnotify_event *event); /* functions used to manipulate the
marks attached to inodes */ /* * Canonical "ignore mask" including event flags. * * Note the subtle semantic difference from the legacy ->ignored_mask. * ->ignored_mask traditionally only meant which events should be ignored, * while ->ignore_mask also includes flags regarding the type of objects on * which events should be ignored. */ static inline __u32 fsnotify_ignore_mask(struct fsnotify_mark *mark) { __u32 ignore_mask = mark->ignore_mask; /* The event flags in ignore mask take effect */ if (mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS) return ignore_mask; /* * Legacy behavior: * - Always ignore events on dir * - Ignore events on child if parent is watching children */ ignore_mask |= FS_ISDIR; ignore_mask &= ~FS_EVENT_ON_CHILD; ignore_mask |= mark->mask & FS_EVENT_ON_CHILD; return ignore_mask; } /* Legacy ignored_mask - only event types to ignore */ static inline __u32 fsnotify_ignored_events(struct fsnotify_mark *mark) { return mark->ignore_mask & ALL_FSNOTIFY_EVENTS; } /* * Check if mask (or ignore mask) should be applied depending if victim is a * directory and whether it is reported to a watching parent. */ static inline bool fsnotify_mask_applicable(__u32 mask, bool is_dir, int iter_type) { /* Should mask be applied to a directory? */ if (is_dir && !(mask & FS_ISDIR)) return false; /* Should mask be applied to a child? */ if (iter_type == FSNOTIFY_ITER_TYPE_PARENT && !(mask & FS_EVENT_ON_CHILD)) return false; return true; } /* * Effective ignore mask taking into account if event victim is a * directory and whether it is reported to a watching parent. */ static inline __u32 fsnotify_effective_ignore_mask(struct fsnotify_mark *mark, bool is_dir, int iter_type) { __u32 ignore_mask = fsnotify_ignored_events(mark); if (!ignore_mask) return 0; /* For non-dir and non-child, no need to consult the event flags */ if (!is_dir && iter_type != FSNOTIFY_ITER_TYPE_PARENT) return ignore_mask; ignore_mask = fsnotify_ignore_mask(mark); if (!fsnotify_mask_applicable(ignore_mask, is_dir, iter_type)) return 0; return ignore_mask & ALL_FSNOTIFY_EVENTS; } /* Get mask for calculating object interest taking ignore mask into account */ static inline __u32 fsnotify_calc_mask(struct fsnotify_mark *mark) { __u32 mask = mark->mask; if (!fsnotify_ignored_events(mark)) return mask; /* Interest in FS_MODIFY may be needed for clearing ignore mask */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) mask |= FS_MODIFY; /* * If mark is interested in ignoring events on children, the object must * show interest in those events for fsnotify_parent() to notice it. 
*/ return mask | mark->ignore_mask; } /* Get mask of events for a list of marks */ extern __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn); /* Calculate mask of events for a list of marks */ extern void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn); extern void fsnotify_init_mark(struct fsnotify_mark *mark, struct fsnotify_group *group); /* Find mark belonging to given group in the list of marks */ struct fsnotify_mark *fsnotify_find_mark(void *obj, unsigned int obj_type, struct fsnotify_group *group); /* attach the mark to the object */ int fsnotify_add_mark(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags); int fsnotify_add_mark_locked(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags); /* attach the mark to the inode */ static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark, struct inode *inode, int add_flags) { return fsnotify_add_mark(mark, inode, FSNOTIFY_OBJ_TYPE_INODE, add_flags); } static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark, struct inode *inode, int add_flags) { return fsnotify_add_mark_locked(mark, inode, FSNOTIFY_OBJ_TYPE_INODE, add_flags); } static inline struct fsnotify_mark *fsnotify_find_inode_mark( struct inode *inode, struct fsnotify_group *group) { return fsnotify_find_mark(inode, FSNOTIFY_OBJ_TYPE_INODE, group); } /* given a group and a mark, flag mark to be freed when all references are dropped */ extern void fsnotify_destroy_mark(struct fsnotify_mark *mark, struct fsnotify_group *group); /* detach mark from inode / mount list, group list, drop inode reference */ extern void fsnotify_detach_mark(struct fsnotify_mark *mark); /* free mark */ extern void fsnotify_free_mark(struct fsnotify_mark *mark); /* Wait until all marks queued for destruction are destroyed */ extern void fsnotify_wait_marks_destroyed(void); /* Clear all of the marks of a group attached to a given object type */ extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int obj_type); /* run all the marks in a group, and clear all of the vfsmount marks */ static inline void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) { fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_VFSMOUNT); } /* run all the marks in a group, and clear all of the inode marks */ static inline void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) { fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_INODE); } /* run all the marks in a group, and clear all of the sn marks */ static inline void fsnotify_clear_sb_marks_by_group(struct fsnotify_group *group) { fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_SB); } extern void fsnotify_get_mark(struct fsnotify_mark *mark); extern void fsnotify_put_mark(struct fsnotify_mark *mark); extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info); extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info); static inline void fsnotify_init_event(struct fsnotify_event *event) { INIT_LIST_HEAD(&event->list); } #else static inline int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, struct inode *inode, u32 cookie) { return 0; } static inline int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type) { return 0; } static inline void __fsnotify_inode_delete(struct inode *inode) {} static inline void __fsnotify_vfsmount_delete(struct vfsmount *mnt) {} static inline void 
fsnotify_sb_delete(struct super_block *sb) {} static inline void fsnotify_sb_free(struct super_block *sb) {} static inline void fsnotify_update_flags(struct dentry *dentry) {} static inline u32 fsnotify_get_cookie(void) { return 0; } static inline void fsnotify_unmount_inodes(struct super_block *sb) {} #endif /* CONFIG_FSNOTIFY */ #endif /* __KERNEL__ */ #endif /* __LINUX_FSNOTIFY_BACKEND_H */
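The header above declares the backend-facing API: a listener defines an fsnotify_ops, allocates a group, and attaches marks carrying an event mask to the objects it cares about. The following is a minimal, hypothetical sketch of how an inode-mark-only backend could wire these calls together (the watchdemo_* names are invented; locking, error reporting and mark teardown are abbreviated). It only implements handle_inode_event and free_mark, which the header describes as the simple variant for groups with inode marks and no ignore mask.

/* Hypothetical sketch of a minimal fsnotify backend, not an existing listener. */
#include <linux/fsnotify_backend.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/err.h>

static struct fsnotify_group *watchdemo_group;

static int watchdemo_handle_inode_event(struct fsnotify_mark *mark, u32 mask,
					struct inode *inode, struct inode *dir,
					const struct qstr *file_name, u32 cookie)
{
	pr_info("watchdemo: event 0x%x on inode %lu\n", mask,
		inode ? inode->i_ino : 0);
	return 0;
}

static void watchdemo_free_mark(struct fsnotify_mark *mark)
{
	kfree(mark);	/* called on final put of the mark */
}

static const struct fsnotify_ops watchdemo_ops = {
	.handle_inode_event	= watchdemo_handle_inode_event,
	.free_mark		= watchdemo_free_mark,
};

static int watchdemo_init(void)
{
	watchdemo_group = fsnotify_alloc_group(&watchdemo_ops, 0);
	return PTR_ERR_OR_ZERO(watchdemo_group);
}

/* Attach a mark so modify/create/delete events on @inode reach the group. */
static int watchdemo_watch_inode(struct inode *inode)
{
	struct fsnotify_mark *mark;
	int ret;

	mark = kzalloc(sizeof(*mark), GFP_KERNEL);
	if (!mark)
		return -ENOMEM;

	fsnotify_init_mark(mark, watchdemo_group);
	mark->mask = FS_MODIFY | FS_CREATE | FS_DELETE;

	ret = fsnotify_add_inode_mark(mark, inode, 0);
	if (ret)
		fsnotify_put_mark(mark);	/* drops the ref; free_mark() runs */
	return ret;
}

static void watchdemo_exit(void)
{
	/* drop our marks, then release the group */
	fsnotify_clear_inode_marks_by_group(watchdemo_group);
	fsnotify_destroy_group(watchdemo_group);
}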
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_LIST_BL_H #define _LINUX_LIST_BL_H #include <linux/list.h> #include <linux/bit_spinlock.h> /* * Special version of lists, where head of the list has a lock in the lowest * bit. This is useful for scalable hash tables without increasing memory * footprint overhead. * * For modification operations, the 0 bit of hlist_bl_head->first * pointer must be set. * * With some small modifications, this can easily be adapted to store several * arbitrary bits (not just a single lock bit), if the need arises to store * some fast and compact auxiliary data. */ #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) #define LIST_BL_LOCKMASK 1UL #else #define LIST_BL_LOCKMASK 0UL #endif #ifdef CONFIG_DEBUG_LIST #define LIST_BL_BUG_ON(x) BUG_ON(x) #else #define LIST_BL_BUG_ON(x) #endif struct hlist_bl_head { struct hlist_bl_node *first; }; struct hlist_bl_node { struct hlist_bl_node *next, **pprev; }; #define INIT_HLIST_BL_HEAD(ptr) \ ((ptr)->first = NULL) static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h) { h->next = NULL; h->pprev = NULL; } #define hlist_bl_entry(ptr, type, member) container_of(ptr,type,member) static inline bool hlist_bl_unhashed(const struct hlist_bl_node *h) { return !h->pprev; } static inline struct hlist_bl_node *hlist_bl_first(struct hlist_bl_head *h) { return (struct hlist_bl_node *) ((unsigned long)h->first & ~LIST_BL_LOCKMASK); } static inline void hlist_bl_set_first(struct hlist_bl_head *h, struct hlist_bl_node *n) { LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) != LIST_BL_LOCKMASK); h->first = (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK); } static inline bool hlist_bl_empty(const struct hlist_bl_head *h) { return !((unsigned long)READ_ONCE(h->first) & ~LIST_BL_LOCKMASK); } static inline void hlist_bl_add_head(struct hlist_bl_node *n, struct hlist_bl_head *h) { struct hlist_bl_node *first = hlist_bl_first(h); n->next = first; if (first) first->pprev = &n->next; n->pprev = &h->first; hlist_bl_set_first(h, n); } static inline void hlist_bl_add_before(struct hlist_bl_node *n, struct hlist_bl_node *next) { struct hlist_bl_node **pprev = next->pprev; n->pprev = pprev; n->next = next; next->pprev = &n->next; /* pprev may be `first`, so be careful not to lose the lock bit */ WRITE_ONCE(*pprev, (struct hlist_bl_node *) ((uintptr_t)n | ((uintptr_t)*pprev & LIST_BL_LOCKMASK))); } static inline void hlist_bl_add_behind(struct hlist_bl_node *n, struct hlist_bl_node *prev) { n->next = prev->next; n->pprev = &prev->next; prev->next = n; if (n->next) n->next->pprev = &n->next; } static inline void __hlist_bl_del(struct hlist_bl_node *n) { struct hlist_bl_node *next = n->next; struct hlist_bl_node **pprev = n->pprev; LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); /* pprev may be
`first`, so be careful not to lose the lock bit */ WRITE_ONCE(*pprev, (struct hlist_bl_node *) ((unsigned long)next | ((unsigned long)*pprev & LIST_BL_LOCKMASK))); if (next) next->pprev = pprev; } static inline void hlist_bl_del(struct hlist_bl_node *n) { __hlist_bl_del(n); n->next = LIST_POISON1; n->pprev = LIST_POISON2; } static inline void hlist_bl_del_init(struct hlist_bl_node *n) { if (!hlist_bl_unhashed(n)) { __hlist_bl_del(n); INIT_HLIST_BL_NODE(n); } } static inline void hlist_bl_lock(struct hlist_bl_head *b) { bit_spin_lock(0, (unsigned long *)b); } static inline void hlist_bl_unlock(struct hlist_bl_head *b) { __bit_spin_unlock(0, (unsigned long *)b); } static inline bool hlist_bl_is_locked(struct hlist_bl_head *b) { return bit_spin_is_locked(0, (unsigned long *)b); } /** * hlist_bl_for_each_entry - iterate over list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * */ #define hlist_bl_for_each_entry(tpos, pos, head, member) \ for (pos = hlist_bl_first(head); \ pos && \ ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1;}); \ pos = pos->next) /** * hlist_bl_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @n: another &struct hlist_node to use as temporary storage * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_bl_for_each_entry_safe(tpos, pos, n, head, member) \ for (pos = hlist_bl_first(head); \ pos && ({ n = pos->next; 1; }) && \ ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1;}); \ pos = n) #endif
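Because the lock lives in bit 0 of the head pointer, each hash bucket doubles as its own spinlock with no extra memory cost. Below is a minimal, hypothetical usage sketch (the demo_* names, table size and hash function are invented for illustration): writers and readers take the per-bucket bit lock around list manipulation and traversal; real users often pair this with RCU lookups instead, which is outside this header.

/* Hypothetical hash table built on hlist_bl, for illustration only. */
#include <linux/list_bl.h>
#include <linux/hash.h>

#define DEMO_HASH_BITS	7
static struct hlist_bl_head demo_table[1 << DEMO_HASH_BITS];

struct demo_obj {
	unsigned long key;
	struct hlist_bl_node node;
};

static struct hlist_bl_head *demo_bucket(unsigned long key)
{
	return &demo_table[hash_long(key, DEMO_HASH_BITS)];
}

static void demo_table_init(void)
{
	int i;

	for (i = 0; i < (1 << DEMO_HASH_BITS); i++)
		INIT_HLIST_BL_HEAD(&demo_table[i]);
}

/* The bucket's bit spinlock serialises all modifications of that chain. */
static void demo_insert(struct demo_obj *obj)
{
	struct hlist_bl_head *head = demo_bucket(obj->key);

	INIT_HLIST_BL_NODE(&obj->node);
	hlist_bl_lock(head);
	hlist_bl_add_head(&obj->node, head);
	hlist_bl_unlock(head);
}

static struct demo_obj *demo_lookup(unsigned long key)
{
	struct hlist_bl_head *head = demo_bucket(key);
	struct hlist_bl_node *pos;
	struct demo_obj *obj, *found = NULL;

	hlist_bl_lock(head);
	hlist_bl_for_each_entry(obj, pos, head, node) {
		if (obj->key == key) {
			found = obj;
			break;
		}
	}
	hlist_bl_unlock(head);
	return found;
}

static void demo_remove(struct demo_obj *obj)
{
	struct hlist_bl_head *head = demo_bucket(obj->key);

	hlist_bl_lock(head);
	hlist_bl_del_init(&obj->node);
	hlist_bl_unlock(head);
}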
// SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2017 - 2019, Intel Corporation.
*/ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/sched/signal.h> #include <linux/atomic.h> #include <net/sock.h> #include <net/inet_common.h> #include <net/inet_hashtables.h> #include <net/protocol.h> #include <net/tcp_states.h> #if IS_ENABLED(CONFIG_MPTCP_IPV6) #include <net/transp_v6.h> #endif #include <net/mptcp.h> #include <net/hotdata.h> #include <net/xfrm.h> #include <asm/ioctls.h> #include "protocol.h" #include "mib.h" #define CREATE_TRACE_POINTS #include <trace/events/mptcp.h> #if IS_ENABLED(CONFIG_MPTCP_IPV6) struct mptcp6_sock { struct mptcp_sock msk; struct ipv6_pinfo np; }; #endif enum { MPTCP_CMSG_TS = BIT(0), MPTCP_CMSG_INQ = BIT(1), }; static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_smp; static void __mptcp_destroy_sock(struct sock *sk); static void mptcp_check_send_data_fin(struct sock *sk); DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); static struct net_device mptcp_napi_dev; /* Returns end sequence number of the receiver's advertised window */ static u64 mptcp_wnd_end(const struct mptcp_sock *msk) { return READ_ONCE(msk->wnd_end); } static const struct proto_ops *mptcp_fallback_tcp_ops(const struct sock *sk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (sk->sk_prot == &tcpv6_prot) return &inet6_stream_ops; #endif WARN_ON_ONCE(sk->sk_prot != &tcp_prot); return &inet_stream_ops; } static int __mptcp_socket_create(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; struct socket *ssock; int err; err = mptcp_subflow_create_socket(sk, sk->sk_family, &ssock); if (err) return err; msk->scaling_ratio = tcp_sk(ssock->sk)->scaling_ratio; WRITE_ONCE(msk->first, ssock->sk); subflow = mptcp_subflow_ctx(ssock->sk); list_add(&subflow->node, &msk->conn_list); sock_hold(ssock->sk); subflow->request_mptcp = 1; subflow->subflow_id = msk->subflow_id++; /* This is the first subflow, always with id 0 */ WRITE_ONCE(subflow->local_id, 0); mptcp_sock_graft(msk->first, sk->sk_socket); iput(SOCK_INODE(ssock)); return 0; } /* If the MPC handshake is not started, returns the first subflow, * eventually allocating it. 
*/ struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; int ret; if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) return ERR_PTR(-EINVAL); if (!msk->first) { ret = __mptcp_socket_create(msk); if (ret) return ERR_PTR(ret); } return msk->first; } static void mptcp_drop(struct sock *sk, struct sk_buff *skb) { sk_drops_add(sk, skb); __kfree_skb(skb); } static void mptcp_rmem_fwd_alloc_add(struct sock *sk, int size) { WRITE_ONCE(mptcp_sk(sk)->rmem_fwd_alloc, mptcp_sk(sk)->rmem_fwd_alloc + size); } static void mptcp_rmem_charge(struct sock *sk, int size) { mptcp_rmem_fwd_alloc_add(sk, -size); } static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, struct sk_buff *from) { bool fragstolen; int delta; if (MPTCP_SKB_CB(from)->offset || !skb_try_coalesce(to, from, &fragstolen, &delta)) return false; pr_debug("colesced seq %llx into %llx new len %d new end seq %llx\n", MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq, to->len, MPTCP_SKB_CB(from)->end_seq); MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq; /* note the fwd memory can reach a negative value after accounting * for the delta, but the later skb free will restore a non * negative one */ atomic_add(delta, &sk->sk_rmem_alloc); mptcp_rmem_charge(sk, delta); kfree_skb_partial(from, fragstolen); return true; } static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to, struct sk_buff *from) { if (MPTCP_SKB_CB(from)->map_seq != MPTCP_SKB_CB(to)->end_seq) return false; return mptcp_try_coalesce((struct sock *)msk, to, from); } static void __mptcp_rmem_reclaim(struct sock *sk, int amount) { amount >>= PAGE_SHIFT; mptcp_rmem_charge(sk, amount << PAGE_SHIFT); __sk_mem_reduce_allocated(sk, amount); } static void mptcp_rmem_uncharge(struct sock *sk, int size) { struct mptcp_sock *msk = mptcp_sk(sk); int reclaimable; mptcp_rmem_fwd_alloc_add(sk, size); reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk); /* see sk_mem_uncharge() for the rationale behind the following schema */ if (unlikely(reclaimable >= PAGE_SIZE)) __mptcp_rmem_reclaim(sk, reclaimable); } static void mptcp_rfree(struct sk_buff *skb) { unsigned int len = skb->truesize; struct sock *sk = skb->sk; atomic_sub(len, &sk->sk_rmem_alloc); mptcp_rmem_uncharge(sk, len); } void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); skb->sk = sk; skb->destructor = mptcp_rfree; atomic_add(skb->truesize, &sk->sk_rmem_alloc); mptcp_rmem_charge(sk, skb->truesize); } /* "inspired" by tcp_data_queue_ofo(), main differences: * - use mptcp seqs * - don't cope with sacks */ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) { struct sock *sk = (struct sock *)msk; struct rb_node **p, *parent; u64 seq, end_seq, max_seq; struct sk_buff *skb1; seq = MPTCP_SKB_CB(skb)->map_seq; end_seq = MPTCP_SKB_CB(skb)->end_seq; max_seq = atomic64_read(&msk->rcv_wnd_sent); pr_debug("msk=%p seq=%llx limit=%llx empty=%d\n", msk, seq, max_seq, RB_EMPTY_ROOT(&msk->out_of_order_queue)); if (after64(end_seq, max_seq)) { /* out of window */ mptcp_drop(sk, skb); pr_debug("oow by %lld, rcv_wnd_sent %llu\n", (unsigned long long)end_seq - (unsigned long)max_seq, (unsigned long long)atomic64_read(&msk->rcv_wnd_sent)); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW); return; } p = &msk->out_of_order_queue.rb_node; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUE); if (RB_EMPTY_ROOT(&msk->out_of_order_queue)) { rb_link_node(&skb->rbnode, NULL, p); rb_insert_color(&skb->rbnode, 
&msk->out_of_order_queue); msk->ooo_last_skb = skb; goto end; } /* with 2 subflows, adding at end of ooo queue is quite likely * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. */ if (mptcp_ooo_try_coalesce(msk, msk->ooo_last_skb, skb)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL); return; } /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */ if (!before64(seq, MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL); parent = &msk->ooo_last_skb->rbnode; p = &parent->rb_right; goto insert; } /* Find place to insert this segment. Handle overlaps on the way. */ parent = NULL; while (*p) { parent = *p; skb1 = rb_to_skb(parent); if (before64(seq, MPTCP_SKB_CB(skb1)->map_seq)) { p = &parent->rb_left; continue; } if (before64(seq, MPTCP_SKB_CB(skb1)->end_seq)) { if (!after64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) { /* All the bits are present. Drop. */ mptcp_drop(sk, skb); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); return; } if (after64(seq, MPTCP_SKB_CB(skb1)->map_seq)) { /* partial overlap: * | skb | * | skb1 | * continue traversing */ } else { /* skb's seq == skb1's seq and skb covers skb1. * Replace skb1 with skb. */ rb_replace_node(&skb1->rbnode, &skb->rbnode, &msk->out_of_order_queue); mptcp_drop(sk, skb1); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); goto merge_right; } } else if (mptcp_ooo_try_coalesce(msk, skb1, skb)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE); return; } p = &parent->rb_right; } insert: /* Insert segment into RB tree. */ rb_link_node(&skb->rbnode, parent, p); rb_insert_color(&skb->rbnode, &msk->out_of_order_queue); merge_right: /* Remove other segments covered by skb. */ while ((skb1 = skb_rb_next(skb)) != NULL) { if (before64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) break; rb_erase(&skb1->rbnode, &msk->out_of_order_queue); mptcp_drop(sk, skb1); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); } /* If there is no skb after us, we are the last_skb ! 
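 * (keep ooo_last_skb pointing at the rbtree tail so the fast path above
 *  can skip the rbtree lookup)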
*/ if (!skb1) msk->ooo_last_skb = skb; end: skb_condense(skb); mptcp_set_owner_r(skb, sk); } static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size) { struct mptcp_sock *msk = mptcp_sk(sk); int amt, amount; if (size <= msk->rmem_fwd_alloc) return true; size -= msk->rmem_fwd_alloc; amt = sk_mem_pages(size); amount = amt << PAGE_SHIFT; if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) return false; mptcp_rmem_fwd_alloc_add(sk, amount); return true; } static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, struct sk_buff *skb, unsigned int offset, size_t copy_len) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; struct sk_buff *tail; bool has_rxtstamp; __skb_unlink(skb, &ssk->sk_receive_queue); skb_ext_reset(skb); skb_orphan(skb); /* try to fetch required memory from subflow */ if (!mptcp_rmem_schedule(sk, ssk, skb->truesize)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); goto drop; } has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; /* the skb map_seq accounts for the skb offset: * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq * value */ MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow); MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len; MPTCP_SKB_CB(skb)->offset = offset; MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp; if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { /* in sequence */ msk->bytes_received += copy_len; WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len); tail = skb_peek_tail(&sk->sk_receive_queue); if (tail && mptcp_try_coalesce(sk, tail, skb)) return true; mptcp_set_owner_r(skb, sk); __skb_queue_tail(&sk->sk_receive_queue, skb); return true; } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) { mptcp_data_queue_ofo(msk, skb); return false; } /* old data, keep it simple and drop the whole pkt, sender * will retransmit as needed, if needed. 
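 * The dropped duplicate is only accounted via the MPTCP_MIB_DUPDATA counter.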
*/ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); drop: mptcp_drop(sk, skb); return false; } static void mptcp_stop_rtx_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); sk_stop_timer(sk, &icsk->icsk_retransmit_timer); mptcp_sk(sk)->timer_ival = 0; } static void mptcp_close_wake_up(struct sock *sk) { if (sock_flag(sk, SOCK_DEAD)) return; sk->sk_state_change(sk); if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); else sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); } /* called under the msk socket lock */ static bool mptcp_pending_data_fin_ack(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); return ((1 << sk->sk_state) & (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) && msk->write_seq == READ_ONCE(msk->snd_una); } static void mptcp_check_data_fin_ack(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); /* Look for an acknowledged DATA_FIN */ if (mptcp_pending_data_fin_ack(sk)) { WRITE_ONCE(msk->snd_data_fin_enable, 0); switch (sk->sk_state) { case TCP_FIN_WAIT1: mptcp_set_state(sk, TCP_FIN_WAIT2); break; case TCP_CLOSING: case TCP_LAST_ACK: mptcp_set_state(sk, TCP_CLOSE); break; } mptcp_close_wake_up(sk); } } /* can be called with no lock acquired */ static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq) { struct mptcp_sock *msk = mptcp_sk(sk); if (READ_ONCE(msk->rcv_data_fin) && ((1 << inet_sk_state_load(sk)) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2))) { u64 rcv_data_fin_seq = READ_ONCE(msk->rcv_data_fin_seq); if (READ_ONCE(msk->ack_seq) == rcv_data_fin_seq) { if (seq) *seq = rcv_data_fin_seq; return true; } } return false; } static void mptcp_set_datafin_timeout(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); u32 retransmits; retransmits = min_t(u32, icsk->icsk_retransmits, ilog2(TCP_RTO_MAX / TCP_RTO_MIN)); mptcp_sk(sk)->timer_ival = TCP_RTO_MIN << retransmits; } static void __mptcp_set_timeout(struct sock *sk, long tout) { mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN; } static long mptcp_timeout_from_subflow(const struct mptcp_subflow_context *subflow) { const struct sock *ssk = mptcp_subflow_tcp_sock(subflow); return inet_csk(ssk)->icsk_pending && !subflow->stale_count ? 
inet_csk(ssk)->icsk_timeout - jiffies : 0; } static void mptcp_set_timeout(struct sock *sk) { struct mptcp_subflow_context *subflow; long tout = 0; mptcp_for_each_subflow(mptcp_sk(sk), subflow) tout = max(tout, mptcp_timeout_from_subflow(subflow)); __mptcp_set_timeout(sk, tout); } static inline bool tcp_can_send_ack(const struct sock *ssk) { return !((1 << inet_sk_state_load(ssk)) & (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN)); } void __mptcp_subflow_send_ack(struct sock *ssk) { if (tcp_can_send_ack(ssk)) tcp_send_ack(ssk); } static void mptcp_subflow_send_ack(struct sock *ssk) { bool slow; slow = lock_sock_fast(ssk); __mptcp_subflow_send_ack(ssk); unlock_sock_fast(ssk, slow); } static void mptcp_send_ack(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; mptcp_for_each_subflow(msk, subflow) mptcp_subflow_send_ack(mptcp_subflow_tcp_sock(subflow)); } static void mptcp_subflow_cleanup_rbuf(struct sock *ssk) { bool slow; slow = lock_sock_fast(ssk); if (tcp_can_send_ack(ssk)) tcp_cleanup_rbuf(ssk, 1); unlock_sock_fast(ssk, slow); } static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty) { const struct inet_connection_sock *icsk = inet_csk(ssk); u8 ack_pending = READ_ONCE(icsk->icsk_ack.pending); const struct tcp_sock *tp = tcp_sk(ssk); return (ack_pending & ICSK_ACK_SCHED) && ((READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->rcv_wup) > READ_ONCE(icsk->icsk_ack.rcv_mss)) || (rx_empty && ack_pending & (ICSK_ACK_PUSHED2 | ICSK_ACK_PUSHED))); } static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) { int old_space = READ_ONCE(msk->old_wspace); struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; int space = __mptcp_space(sk); bool cleanup, rx_empty; cleanup = (space > 0) && (space >= (old_space << 1)); rx_empty = !__mptcp_rmem(sk); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (cleanup || mptcp_subflow_could_cleanup(ssk, rx_empty)) mptcp_subflow_cleanup_rbuf(ssk); } } static bool mptcp_check_data_fin(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); u64 rcv_data_fin_seq; bool ret = false; /* Need to ack a DATA_FIN received from a peer while this side * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2. * msk->rcv_data_fin was set when parsing the incoming options * at the subflow level and the msk lock was not held, so this * is the first opportunity to act on the DATA_FIN and change * the msk state. * * If we are caught up to the sequence number of the incoming * DATA_FIN, send the DATA_ACK now and do state transition. If * not caught up, do nothing and let the recv code send DATA_ACK * when catching up. 
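 * The resulting transitions are: ESTABLISHED -> CLOSE_WAIT,
 * FIN_WAIT1 -> CLOSING, FIN_WAIT2 -> CLOSE.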
*/ if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) { WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1); WRITE_ONCE(msk->rcv_data_fin, 0); WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN); smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ switch (sk->sk_state) { case TCP_ESTABLISHED: mptcp_set_state(sk, TCP_CLOSE_WAIT); break; case TCP_FIN_WAIT1: mptcp_set_state(sk, TCP_CLOSING); break; case TCP_FIN_WAIT2: mptcp_set_state(sk, TCP_CLOSE); break; default: /* Other states not expected */ WARN_ON_ONCE(1); break; } ret = true; if (!__mptcp_check_fallback(msk)) mptcp_send_ack(msk); mptcp_close_wake_up(sk); } return ret; } static void mptcp_dss_corruption(struct mptcp_sock *msk, struct sock *ssk) { if (READ_ONCE(msk->allow_infinite_fallback)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSCORRUPTIONFALLBACK); mptcp_do_fallback(ssk); } else { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSCORRUPTIONRESET); mptcp_subflow_reset(ssk); } } static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, struct sock *ssk, unsigned int *bytes) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; unsigned int moved = 0; bool more_data_avail; struct tcp_sock *tp; bool done = false; int sk_rbuf; sk_rbuf = READ_ONCE(sk->sk_rcvbuf); if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); if (unlikely(ssk_rbuf > sk_rbuf)) { WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf); sk_rbuf = ssk_rbuf; } } pr_debug("msk=%p ssk=%p\n", msk, ssk); tp = tcp_sk(ssk); do { u32 map_remaining, offset; u32 seq = tp->copied_seq; struct sk_buff *skb; bool fin; /* try to move as much data as available */ map_remaining = subflow->map_data_len - mptcp_subflow_get_map_offset(subflow); skb = skb_peek(&ssk->sk_receive_queue); if (!skb) { /* With racing move_skbs_to_msk() and __mptcp_move_skbs(), * a different CPU can have already processed the pending * data, stop here or we can enter an infinite loop */ if (!moved) done = true; break; } if (__mptcp_check_fallback(msk)) { /* Under fallback skbs have no MPTCP extension and TCP could * collapse them between the dummy map creation and the * current dequeue. Be sure to adjust the map size. 
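 * (both the local map_remaining and the subflow map length are reset to
 *  the current skb length)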
*/ map_remaining = skb->len; subflow->map_data_len = skb->len; } offset = seq - TCP_SKB_CB(skb)->seq; fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (fin) { done = true; seq++; } if (offset < skb->len) { size_t len = skb->len - offset; if (tp->urg_data) done = true; if (__mptcp_move_skb(msk, ssk, skb, offset, len)) moved += len; seq += len; if (unlikely(map_remaining < len)) { DEBUG_NET_WARN_ON_ONCE(1); mptcp_dss_corruption(msk, ssk); } } else { if (unlikely(!fin)) { DEBUG_NET_WARN_ON_ONCE(1); mptcp_dss_corruption(msk, ssk); } sk_eat_skb(ssk, skb); done = true; } WRITE_ONCE(tp->copied_seq, seq); more_data_avail = mptcp_subflow_data_available(ssk); if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) { done = true; break; } } while (more_data_avail); if (moved > 0) msk->last_data_recv = tcp_jiffies32; *bytes += moved; return done; } static bool __mptcp_ofo_queue(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; struct sk_buff *skb, *tail; bool moved = false; struct rb_node *p; u64 end_seq; p = rb_first(&msk->out_of_order_queue); pr_debug("msk=%p empty=%d\n", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue)); while (p) { skb = rb_to_skb(p); if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) break; p = rb_next(p); rb_erase(&skb->rbnode, &msk->out_of_order_queue); if (unlikely(!after64(MPTCP_SKB_CB(skb)->end_seq, msk->ack_seq))) { mptcp_drop(sk, skb); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); continue; } end_seq = MPTCP_SKB_CB(skb)->end_seq; tail = skb_peek_tail(&sk->sk_receive_queue); if (!tail || !mptcp_ooo_try_coalesce(msk, tail, skb)) { int delta = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq; /* skip overlapping data, if any */ pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d\n", MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq, delta); MPTCP_SKB_CB(skb)->offset += delta; MPTCP_SKB_CB(skb)->map_seq += delta; __skb_queue_tail(&sk->sk_receive_queue, skb); } msk->bytes_received += end_seq - msk->ack_seq; WRITE_ONCE(msk->ack_seq, end_seq); moved = true; } return moved; } static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk) { int err = sock_error(ssk); int ssk_state; if (!err) return false; /* only propagate errors on fallen-back sockets or * on MPC connect */ if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(mptcp_sk(sk))) return false; /* We need to propagate only transition to CLOSE state. * Orphaned socket will see such state change via * subflow_sched_work_if_closed() and that path will properly * destroy the msk as needed. */ ssk_state = inet_sk_state_load(ssk); if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD)) mptcp_set_state(sk, ssk_state); WRITE_ONCE(sk->sk_err, -err); /* This barrier is coupled with smp_rmb() in mptcp_poll() */ smp_wmb(); sk_error_report(sk); return true; } void __mptcp_error_report(struct sock *sk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); mptcp_for_each_subflow(msk, subflow) if (__mptcp_subflow_error_report(sk, mptcp_subflow_tcp_sock(subflow))) break; } /* In most cases we will be able to lock the mptcp socket. If its already * owned, we need to defer to the work queue to avoid ABBA deadlock. 
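 * When the msk is owned by user context, the subflow error report is
 * deferred via msk->cb_flags; DATA_FIN state changes are always left to
 * the worker.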
*/ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; unsigned int moved = 0; __mptcp_move_skbs_from_subflow(msk, ssk, &moved); __mptcp_ofo_queue(msk); if (unlikely(ssk->sk_err)) { if (!sock_owned_by_user(sk)) __mptcp_error_report(sk); else __set_bit(MPTCP_ERROR_REPORT, &msk->cb_flags); } /* If the moves have caught up with the DATA_FIN sequence number * it's time to ack the DATA_FIN and change socket state, but * this is not a good place to change state. Let the workqueue * do it. */ if (mptcp_pending_data_fin(sk, NULL)) mptcp_schedule_work(sk); return moved > 0; } void mptcp_data_ready(struct sock *sk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(sk); int sk_rbuf, ssk_rbuf; /* The peer can send data while we are shutting down this * subflow at msk destruction time, but we must avoid enqueuing * more data to the msk receive queue */ if (unlikely(subflow->disposable)) return; ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); sk_rbuf = READ_ONCE(sk->sk_rcvbuf); if (unlikely(ssk_rbuf > sk_rbuf)) sk_rbuf = ssk_rbuf; /* over limit? can't append more skbs to msk, Also, no need to wake-up*/ if (__mptcp_rmem(sk) > sk_rbuf) return; /* Wake-up the reader only for in-sequence data */ mptcp_data_lock(sk); if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) sk->sk_data_ready(sk); mptcp_data_unlock(sk); } static void mptcp_subflow_joined(struct mptcp_sock *msk, struct sock *ssk) { mptcp_subflow_ctx(ssk)->map_seq = READ_ONCE(msk->ack_seq); WRITE_ONCE(msk->allow_infinite_fallback, false); mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC); } static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; if (sk->sk_state != TCP_ESTABLISHED) return false; /* attach to msk socket only after we are sure we will deal with it * at close time */ if (sk->sk_socket && !ssk->sk_socket) mptcp_sock_graft(ssk, sk->sk_socket); mptcp_subflow_ctx(ssk)->subflow_id = msk->subflow_id++; mptcp_sockopt_sync_locked(msk, ssk); mptcp_subflow_joined(msk, ssk); mptcp_stop_tout_timer(sk); __mptcp_propagate_sndbuf(sk, ssk); return true; } static void __mptcp_flush_join_list(struct sock *sk, struct list_head *join_list) { struct mptcp_subflow_context *tmp, *subflow; struct mptcp_sock *msk = mptcp_sk(sk); list_for_each_entry_safe(subflow, tmp, join_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast(ssk); list_move_tail(&subflow->node, &msk->conn_list); if (!__mptcp_finish_join(msk, ssk)) mptcp_subflow_reset(ssk); unlock_sock_fast(ssk, slow); } } static bool mptcp_rtx_timer_pending(struct sock *sk) { return timer_pending(&inet_csk(sk)->icsk_retransmit_timer); } static void mptcp_reset_rtx_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); unsigned long tout; /* prevent rescheduling on close */ if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE)) return; tout = mptcp_sk(sk)->timer_ival; sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout); } bool mptcp_schedule_work(struct sock *sk) { if (inet_sk_state_load(sk) != TCP_CLOSE && schedule_work(&mptcp_sk(sk)->work)) { /* each subflow already holds a reference to the sk, and the * workqueue is invoked by a subflow, so sk can't go away here. 
*/ sock_hold(sk); return true; } return false; } static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; msk_owned_by_me(msk); mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->data_avail)) return mptcp_subflow_tcp_sock(subflow); } return NULL; } static bool mptcp_skb_can_collapse_to(u64 write_seq, const struct sk_buff *skb, const struct mptcp_ext *mpext) { if (!tcp_skb_can_collapse_to(skb)) return false; /* can collapse only if MPTCP level sequence is in order and this * mapping has not been xmitted yet */ return mpext && mpext->data_seq + mpext->data_len == write_seq && !mpext->frozen; } /* we can append data to the given data frag if: * - there is space available in the backing page_frag * - the data frag tail matches the current page_frag free offset * - the data frag end sequence number matches the current write seq */ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, const struct page_frag *pfrag, const struct mptcp_data_frag *df) { return df && pfrag->page == df->page && pfrag->size - pfrag->offset > 0 && pfrag->offset == (df->offset + df->data_len) && df->data_seq + df->data_len == msk->write_seq; } static void dfrag_uncharge(struct sock *sk, int len) { sk_mem_uncharge(sk, len); sk_wmem_queued_add(sk, -len); } static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag) { int len = dfrag->data_len + dfrag->overhead; list_del(&dfrag->list); dfrag_uncharge(sk, len); put_page(dfrag->page); } /* called under both the msk socket lock and the data lock */ static void __mptcp_clean_una(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; u64 snd_una; snd_una = msk->snd_una; list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) { if (after64(dfrag->data_seq + dfrag->data_len, snd_una)) break; if (unlikely(dfrag == msk->first_pending)) { /* in recovery mode can see ack after the current snd head */ if (WARN_ON_ONCE(!msk->recovery)) break; WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); } dfrag_clear(sk, dfrag); } dfrag = mptcp_rtx_head(sk); if (dfrag && after64(snd_una, dfrag->data_seq)) { u64 delta = snd_una - dfrag->data_seq; /* prevent wrap around in recovery mode */ if (unlikely(delta > dfrag->already_sent)) { if (WARN_ON_ONCE(!msk->recovery)) goto out; if (WARN_ON_ONCE(delta > dfrag->data_len)) goto out; dfrag->already_sent += delta - dfrag->already_sent; } dfrag->data_seq += delta; dfrag->offset += delta; dfrag->data_len -= delta; dfrag->already_sent -= delta; dfrag_uncharge(sk, delta); } /* all retransmitted data acked, recovery completed */ if (unlikely(msk->recovery) && after64(msk->snd_una, msk->recovery_snd_nxt)) msk->recovery = false; out: if (snd_una == msk->snd_nxt && snd_una == msk->write_seq) { if (mptcp_rtx_timer_pending(sk) && !mptcp_data_fin_enabled(msk)) mptcp_stop_rtx_timer(sk); } else { mptcp_reset_rtx_timer(sk); } if (mptcp_pending_data_fin_ack(sk)) mptcp_schedule_work(sk); } static void __mptcp_clean_una_wakeup(struct sock *sk) { lockdep_assert_held_once(&sk->sk_lock.slock); __mptcp_clean_una(sk); mptcp_write_space(sk); } static void mptcp_clean_una_wakeup(struct sock *sk) { mptcp_data_lock(sk); __mptcp_clean_una_wakeup(sk); mptcp_data_unlock(sk); } static void mptcp_enter_memory_pressure(struct sock *sk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); bool first = true; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (first) 
tcp_enter_memory_pressure(ssk); sk_stream_moderate_sndbuf(ssk); first = false; } __mptcp_sync_sndbuf(sk); } /* ensure we get enough memory for the frag hdr, beyond some minimal amount of * data */ static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag) { if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag), pfrag, sk->sk_allocation))) return true; mptcp_enter_memory_pressure(sk); return false; } static struct mptcp_data_frag * mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag, int orig_offset) { int offset = ALIGN(orig_offset, sizeof(long)); struct mptcp_data_frag *dfrag; dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset); dfrag->data_len = 0; dfrag->data_seq = msk->write_seq; dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag); dfrag->offset = offset + sizeof(struct mptcp_data_frag); dfrag->already_sent = 0; dfrag->page = pfrag->page; return dfrag; } struct mptcp_sendmsg_info { int mss_now; int size_goal; u16 limit; u16 sent; unsigned int flags; bool data_lock_held; }; static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *ssk, u64 data_seq, int avail_size) { u64 window_end = mptcp_wnd_end(msk); u64 mptcp_snd_wnd; if (__mptcp_check_fallback(msk)) return avail_size; mptcp_snd_wnd = window_end - data_seq; avail_size = min_t(unsigned int, mptcp_snd_wnd, avail_size); if (unlikely(tcp_sk(ssk)->snd_wnd < mptcp_snd_wnd)) { tcp_sk(ssk)->snd_wnd = min_t(u64, U32_MAX, mptcp_snd_wnd); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_SNDWNDSHARED); } return avail_size; } static bool __mptcp_add_ext(struct sk_buff *skb, gfp_t gfp) { struct skb_ext *mpext = __skb_ext_alloc(gfp); if (!mpext) return false; __skb_ext_set(skb, SKB_EXT_MPTCP, mpext); return true; } static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp) { struct sk_buff *skb; skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp); if (likely(skb)) { if (likely(__mptcp_add_ext(skb, gfp))) { skb_reserve(skb, MAX_TCP_HEADER); skb->ip_summed = CHECKSUM_PARTIAL; INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); return skb; } __kfree_skb(skb); } else { mptcp_enter_memory_pressure(sk); } return NULL; } static struct sk_buff *__mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) { struct sk_buff *skb; skb = __mptcp_do_alloc_tx_skb(sk, gfp); if (!skb) return NULL; if (likely(sk_wmem_schedule(ssk, skb->truesize))) { tcp_skb_entail(ssk, skb); return skb; } tcp_skb_tsorted_anchor_cleanup(skb); kfree_skb(skb); return NULL; } static struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held) { gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation; return __mptcp_alloc_tx_skb(sk, ssk, gfp); } /* note: this always recompute the csum on the whole skb, even * if we just appended a single frag. 
More status info needed */ static void mptcp_update_data_checksum(struct sk_buff *skb, int added) { struct mptcp_ext *mpext = mptcp_get_ext(skb); __wsum csum = ~csum_unfold(mpext->csum); int offset = skb->len - added; mpext->csum = csum_fold(csum_block_add(csum, skb_checksum(skb, offset, added, 0), offset)); } static void mptcp_update_infinite_map(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_ext *mpext) { if (!mpext) return; mpext->infinite_map = 1; mpext->data_len = 0; MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX); mptcp_subflow_ctx(ssk)->send_infinite_map = 0; pr_fallback(msk); mptcp_do_fallback(ssk); } #define MPTCP_MAX_GSO_SIZE (GSO_LEGACY_MAX_SIZE - (MAX_TCP_HEADER + 1)) static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct mptcp_data_frag *dfrag, struct mptcp_sendmsg_info *info) { u64 data_seq = dfrag->data_seq + info->sent; int offset = dfrag->offset + info->sent; struct mptcp_sock *msk = mptcp_sk(sk); bool zero_window_probe = false; struct mptcp_ext *mpext = NULL; bool can_coalesce = false; bool reuse_skb = true; struct sk_buff *skb; size_t copy; int i; pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u\n", msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent); if (WARN_ON_ONCE(info->sent > info->limit || info->limit > dfrag->data_len)) return 0; if (unlikely(!__tcp_can_send(ssk))) return -EAGAIN; /* compute send limit */ if (unlikely(ssk->sk_gso_max_size > MPTCP_MAX_GSO_SIZE)) ssk->sk_gso_max_size = MPTCP_MAX_GSO_SIZE; info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags); copy = info->size_goal; skb = tcp_write_queue_tail(ssk); if (skb && copy > skb->len) { /* Limit the write to the size available in the * current skb, if any, so that we create at most a new skb. * Explicitly tells TCP internals to avoid collapsing on later * queue management operation, to avoid breaking the ext <-> * SSN association set here */ mpext = mptcp_get_ext(skb); if (!mptcp_skb_can_collapse_to(data_seq, skb, mpext)) { TCP_SKB_CB(skb)->eor = 1; tcp_mark_push(tcp_sk(ssk), skb); goto alloc_skb; } i = skb_shinfo(skb)->nr_frags; can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset); if (!can_coalesce && i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) { tcp_mark_push(tcp_sk(ssk), skb); goto alloc_skb; } copy -= skb->len; } else { alloc_skb: skb = mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held); if (!skb) return -ENOMEM; i = skb_shinfo(skb)->nr_frags; reuse_skb = false; mpext = mptcp_get_ext(skb); } /* Zero window and all data acked? Probe. 
*/ copy = mptcp_check_allowed_size(msk, ssk, data_seq, copy); if (copy == 0) { u64 snd_una = READ_ONCE(msk->snd_una); if (snd_una != msk->snd_nxt || tcp_write_queue_tail(ssk)) { tcp_remove_empty_skb(ssk); return 0; } zero_window_probe = true; data_seq = snd_una - 1; copy = 1; } copy = min_t(size_t, copy, info->limit - info->sent); if (!sk_wmem_schedule(ssk, copy)) { tcp_remove_empty_skb(ssk); return -ENOMEM; } if (can_coalesce) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); } else { get_page(dfrag->page); skb_fill_page_desc(skb, i, dfrag->page, offset, copy); } skb->len += copy; skb->data_len += copy; skb->truesize += copy; sk_wmem_queued_add(ssk, copy); sk_mem_charge(ssk, copy); WRITE_ONCE(tcp_sk(ssk)->write_seq, tcp_sk(ssk)->write_seq + copy); TCP_SKB_CB(skb)->end_seq += copy; tcp_skb_pcount_set(skb, 0); /* on skb reuse we just need to update the DSS len */ if (reuse_skb) { TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; mpext->data_len += copy; goto out; } memset(mpext, 0, sizeof(*mpext)); mpext->data_seq = data_seq; mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq; mpext->data_len = copy; mpext->use_map = 1; mpext->dsn64 = 1; pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d\n", mpext->data_seq, mpext->subflow_seq, mpext->data_len, mpext->dsn64); if (zero_window_probe) { mptcp_subflow_ctx(ssk)->rel_write_seq += copy; mpext->frozen = 1; if (READ_ONCE(msk->csum_enabled)) mptcp_update_data_checksum(skb, copy); tcp_push_pending_frames(ssk); return 0; } out: if (READ_ONCE(msk->csum_enabled)) mptcp_update_data_checksum(skb, copy); if (mptcp_subflow_ctx(ssk)->send_infinite_map) mptcp_update_infinite_map(msk, ssk, mpext); trace_mptcp_sendmsg_frag(mpext); mptcp_subflow_ctx(ssk)->rel_write_seq += copy; return copy; } #define MPTCP_SEND_BURST_SIZE ((1 << 16) - \ sizeof(struct tcphdr) - \ MAX_TCP_OPTION_SPACE - \ sizeof(struct ipv6hdr) - \ sizeof(struct frag_hdr)) struct subflow_send_info { struct sock *ssk; u64 linger_time; }; void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow) { if (!subflow->stale) return; subflow->stale = 0; MPTCP_INC_STATS(sock_net(mptcp_subflow_tcp_sock(subflow)), MPTCP_MIB_SUBFLOWRECOVER); } bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) { if (unlikely(subflow->stale)) { u32 rcv_tstamp = READ_ONCE(tcp_sk(mptcp_subflow_tcp_sock(subflow))->rcv_tstamp); if (subflow->stale_rcv_tstamp == rcv_tstamp) return false; mptcp_subflow_set_active(subflow); } return __mptcp_subflow_active(subflow); } #define SSK_MODE_ACTIVE 0 #define SSK_MODE_BACKUP 1 #define SSK_MODE_MAX 2 /* implement the mptcp packet scheduler; * returns the subflow that will transmit the next DSS * additionally updates the rtx timeout */ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) { struct subflow_send_info send_info[SSK_MODE_MAX]; struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; u32 pace, burst, wmem; int i, nr_active = 0; struct sock *ssk; u64 linger_time; long tout = 0; /* pick the subflow with the lower wmem/wspace ratio */ for (i = 0; i < SSK_MODE_MAX; ++i) { send_info[i].ssk = NULL; send_info[i].linger_time = -1; } mptcp_for_each_subflow(msk, subflow) { bool backup = subflow->backup || subflow->request_bkup; trace_mptcp_subflow_get_send(subflow); ssk = mptcp_subflow_tcp_sock(subflow); if (!mptcp_subflow_active(subflow)) continue; tout = max(tout, mptcp_timeout_from_subflow(subflow)); nr_active += !backup; pace = subflow->avg_pacing_rate; if (unlikely(!pace)) { /* init pacing rate from socket */ 
subflow->avg_pacing_rate = READ_ONCE(ssk->sk_pacing_rate); pace = subflow->avg_pacing_rate; if (!pace) continue; } linger_time = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, pace); if (linger_time < send_info[backup].linger_time) { send_info[backup].ssk = ssk; send_info[backup].linger_time = linger_time; } } __mptcp_set_timeout(sk, tout); /* pick the best backup if no other subflow is active */ if (!nr_active) send_info[SSK_MODE_ACTIVE].ssk = send_info[SSK_MODE_BACKUP].ssk; /* According to the blest algorithm, to avoid HoL blocking for the * faster flow, we need to: * - estimate the faster flow linger time * - use the above to estimate the amount of byte transferred * by the faster flow * - check that the amount of queued data is greter than the above, * otherwise do not use the picked, slower, subflow * We select the subflow with the shorter estimated time to flush * the queued mem, which basically ensure the above. We just need * to check that subflow has a non empty cwin. */ ssk = send_info[SSK_MODE_ACTIVE].ssk; if (!ssk || !sk_stream_memory_free(ssk)) return NULL; burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt); wmem = READ_ONCE(ssk->sk_wmem_queued); if (!burst) return ssk; subflow = mptcp_subflow_ctx(ssk); subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem + READ_ONCE(ssk->sk_pacing_rate) * burst, burst + wmem); msk->snd_burst = burst; return ssk; } static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info) { tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal); release_sock(ssk); } static void mptcp_update_post_push(struct mptcp_sock *msk, struct mptcp_data_frag *dfrag, u32 sent) { u64 snd_nxt_new = dfrag->data_seq; dfrag->already_sent += sent; msk->snd_burst -= sent; snd_nxt_new += dfrag->already_sent; /* snd_nxt_new can be smaller than snd_nxt in case mptcp * is recovering after a failover. In that event, this re-sends * old segments. * * Thus compute snd_nxt_new candidate based on * the dfrag->data_seq that was sent and the data * that has been handed to the subflow for transmission * and skip update in case it was old dfrag. */ if (likely(after64(snd_nxt_new, msk->snd_nxt))) { msk->bytes_sent += snd_nxt_new - msk->snd_nxt; WRITE_ONCE(msk->snd_nxt, snd_nxt_new); } } void mptcp_check_and_set_pending(struct sock *sk) { if (mptcp_send_head(sk)) { mptcp_data_lock(sk); mptcp_sk(sk)->cb_flags |= BIT(MPTCP_PUSH_PENDING); mptcp_data_unlock(sk); } } static int __subflow_push_pending(struct sock *sk, struct sock *ssk, struct mptcp_sendmsg_info *info) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dfrag; int len, copied = 0, err = 0; while ((dfrag = mptcp_send_head(sk))) { info->sent = dfrag->already_sent; info->limit = dfrag->data_len; len = dfrag->data_len - dfrag->already_sent; while (len > 0) { int ret = 0; ret = mptcp_sendmsg_frag(sk, ssk, dfrag, info); if (ret <= 0) { err = copied ? 
: ret; goto out; } info->sent += ret; copied += ret; len -= ret; mptcp_update_post_push(msk, dfrag, ret); } WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); if (msk->snd_burst <= 0 || !sk_stream_memory_free(ssk) || !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) { err = copied; goto out; } mptcp_set_timeout(sk); } err = copied; out: if (err > 0) msk->last_data_sent = tcp_jiffies32; return err; } void __mptcp_push_pending(struct sock *sk, unsigned int flags) { struct sock *prev_ssk = NULL, *ssk = NULL; struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sendmsg_info info = { .flags = flags, }; bool do_check_data_fin = false; int push_count = 1; while (mptcp_send_head(sk) && (push_count > 0)) { struct mptcp_subflow_context *subflow; int ret = 0; if (mptcp_sched_get_send(msk)) break; push_count = 0; mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->scheduled)) { mptcp_subflow_set_scheduled(subflow, false); prev_ssk = ssk; ssk = mptcp_subflow_tcp_sock(subflow); if (ssk != prev_ssk) { /* First check. If the ssk has changed since * the last round, release prev_ssk */ if (prev_ssk) mptcp_push_release(prev_ssk, &info); /* Need to lock the new subflow only if different * from the previous one, otherwise we are still * helding the relevant lock */ lock_sock(ssk); } push_count++; ret = __subflow_push_pending(sk, ssk, &info); if (ret <= 0) { if (ret != -EAGAIN || (1 << ssk->sk_state) & (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSE)) push_count--; continue; } do_check_data_fin = true; } } } /* at this point we held the socket lock for the last subflow we used */ if (ssk) mptcp_push_release(ssk, &info); /* ensure the rtx timer is running */ if (!mptcp_rtx_timer_pending(sk)) mptcp_reset_rtx_timer(sk); if (do_check_data_fin) mptcp_check_send_data_fin(sk); } static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool first) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sendmsg_info info = { .data_lock_held = true, }; bool keep_pushing = true; struct sock *xmit_ssk; int copied = 0; info.flags = 0; while (mptcp_send_head(sk) && keep_pushing) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); int ret = 0; /* check for a different subflow usage only after * spooling the first chunk of data */ if (first) { mptcp_subflow_set_scheduled(subflow, false); ret = __subflow_push_pending(sk, ssk, &info); first = false; if (ret <= 0) break; copied += ret; continue; } if (mptcp_sched_get_send(msk)) goto out; if (READ_ONCE(subflow->scheduled)) { mptcp_subflow_set_scheduled(subflow, false); ret = __subflow_push_pending(sk, ssk, &info); if (ret <= 0) keep_pushing = false; copied += ret; } mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->scheduled)) { xmit_ssk = mptcp_subflow_tcp_sock(subflow); if (xmit_ssk != ssk) { mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_SEND); keep_pushing = false; } } } } out: /* __mptcp_alloc_tx_skb could have released some wmem and we are * not going to flush it via release_sock() */ if (copied) { tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); if (!mptcp_rtx_timer_pending(sk)) mptcp_reset_rtx_timer(sk); if (msk->snd_data_fin_enable && msk->snd_nxt + 1 == msk->write_seq) mptcp_schedule_work(sk); } } static int mptcp_disconnect(struct sock *sk, int flags); static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, size_t len, int *copied_syn) { unsigned int saved_flags = msg->msg_flags; struct mptcp_sock *msk = mptcp_sk(sk); struct sock *ssk; int ret; /* on flags based fastopen the mptcp is 
supposed to create the * first subflow right now. Otherwise we are in the defer_connect * path, and the first subflow must be already present. * Since the defer_connect flag is cleared after the first succsful * fastopen attempt, no need to check for additional subflow status. */ if (msg->msg_flags & MSG_FASTOPEN) { ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) return PTR_ERR(ssk); } if (!msk->first) return -EINVAL; ssk = msk->first; lock_sock(ssk); msg->msg_flags |= MSG_DONTWAIT; msk->fastopening = 1; ret = tcp_sendmsg_fastopen(ssk, msg, copied_syn, len, NULL); msk->fastopening = 0; msg->msg_flags = saved_flags; release_sock(ssk); /* do the blocking bits of inet_stream_connect outside the ssk socket lock */ if (ret == -EINPROGRESS && !(msg->msg_flags & MSG_DONTWAIT)) { ret = __inet_stream_connect(sk->sk_socket, msg->msg_name, msg->msg_namelen, msg->msg_flags, 1); /* Keep the same behaviour of plain TCP: zero the copied bytes in * case of any error, except timeout or signal */ if (ret && ret != -EINPROGRESS && ret != -ERESTARTSYS && ret != -EINTR) *copied_syn = 0; } else if (ret && ret != -EINPROGRESS) { /* The disconnect() op called by tcp_sendmsg_fastopen()/ * __inet_stream_connect() can fail, due to looking check, * see mptcp_disconnect(). * Attempt it again outside the problematic scope. */ if (!mptcp_disconnect(sk, 0)) sk->sk_socket->state = SS_UNCONNECTED; } inet_clear_bit(DEFER_CONNECT, sk); return ret; } static int do_copy_data_nocache(struct sock *sk, int copy, struct iov_iter *from, char *to) { if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) { if (!copy_from_iter_full_nocache(to, copy, from)) return -EFAULT; } else if (!copy_from_iter_full(to, copy, from)) { return -EFAULT; } return 0; } /* open-code sk_stream_memory_free() plus sent limit computation to * avoid indirect calls in fast-path. * Called under the msk socket lock, so we can avoid a bunch of ONCE * annotations. 
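 * Return 0 when the write queue already fills sk_sndbuf or the
 * notsent_lowat budget is exhausted, UINT_MAX when no limit applies.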
*/ static u32 mptcp_send_limit(const struct sock *sk) { const struct mptcp_sock *msk = mptcp_sk(sk); u32 limit, not_sent; if (sk->sk_wmem_queued >= READ_ONCE(sk->sk_sndbuf)) return 0; limit = mptcp_notsent_lowat(sk); if (limit == UINT_MAX) return UINT_MAX; not_sent = msk->write_seq - msk->snd_nxt; if (not_sent >= limit) return 0; return limit - not_sent; } static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk = mptcp_sk(sk); struct page_frag *pfrag; size_t copied = 0; int ret = 0; long timeo; /* silently ignore everything else */ msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_FASTOPEN; lock_sock(sk); if (unlikely(inet_test_bit(DEFER_CONNECT, sk) || msg->msg_flags & MSG_FASTOPEN)) { int copied_syn = 0; ret = mptcp_sendmsg_fastopen(sk, msg, len, &copied_syn); copied += copied_syn; if (ret == -EINPROGRESS && copied_syn > 0) goto out; else if (ret) goto do_error; } timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { ret = sk_stream_wait_connect(sk, &timeo); if (ret) goto do_error; } ret = -EPIPE; if (unlikely(sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))) goto do_error; pfrag = sk_page_frag(sk); while (msg_data_left(msg)) { int total_ts, frag_truesize = 0; struct mptcp_data_frag *dfrag; bool dfrag_collapsed; size_t psize, offset; u32 copy_limit; /* ensure fitting the notsent_lowat() constraint */ copy_limit = mptcp_send_limit(sk); if (!copy_limit) goto wait_for_memory; /* reuse tail pfrag, if possible, or carve a new one from the * page allocator */ dfrag = mptcp_pending_tail(sk); dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag); if (!dfrag_collapsed) { if (!mptcp_page_frag_refill(sk, pfrag)) goto wait_for_memory; dfrag = mptcp_carve_data_frag(msk, pfrag, pfrag->offset); frag_truesize = dfrag->overhead; } /* we do not bound vs wspace, to allow a single packet. 
* memory accounting will prevent execessive memory usage * anyway */ offset = dfrag->offset + dfrag->data_len; psize = pfrag->size - offset; psize = min_t(size_t, psize, msg_data_left(msg)); psize = min_t(size_t, psize, copy_limit); total_ts = psize + frag_truesize; if (!sk_wmem_schedule(sk, total_ts)) goto wait_for_memory; ret = do_copy_data_nocache(sk, psize, &msg->msg_iter, page_address(dfrag->page) + offset); if (ret) goto do_error; /* data successfully copied into the write queue */ sk_forward_alloc_add(sk, -total_ts); copied += psize; dfrag->data_len += psize; frag_truesize += psize; pfrag->offset += frag_truesize; WRITE_ONCE(msk->write_seq, msk->write_seq + psize); /* charge data on mptcp pending queue to the msk socket * Note: we charge such data both to sk and ssk */ sk_wmem_queued_add(sk, frag_truesize); if (!dfrag_collapsed) { get_page(dfrag->page); list_add_tail(&dfrag->list, &msk->rtx_queue); if (!msk->first_pending) WRITE_ONCE(msk->first_pending, dfrag); } pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d\n", msk, dfrag->data_seq, dfrag->data_len, dfrag->already_sent, !dfrag_collapsed); continue; wait_for_memory: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); __mptcp_push_pending(sk, msg->msg_flags); ret = sk_stream_wait_memory(sk, &timeo); if (ret) goto do_error; } if (copied) __mptcp_push_pending(sk, msg->msg_flags); out: release_sock(sk); return copied; do_error: if (copied) goto out; copied = sk_stream_error(sk, msg->msg_flags, ret); goto out; } static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, struct msghdr *msg, size_t len, int flags, struct scm_timestamping_internal *tss, int *cmsg_flags) { struct sk_buff *skb, *tmp; int copied = 0; skb_queue_walk_safe(&msk->receive_queue, skb, tmp) { u32 offset = MPTCP_SKB_CB(skb)->offset; u32 data_len = skb->len - offset; u32 count = min_t(size_t, len - copied, data_len); int err; if (!(flags & MSG_TRUNC)) { err = skb_copy_datagram_msg(skb, offset, msg, count); if (unlikely(err < 0)) { if (!copied) return err; break; } } if (MPTCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, tss); *cmsg_flags |= MPTCP_CMSG_TS; } copied += count; if (count < data_len) { if (!(flags & MSG_PEEK)) { MPTCP_SKB_CB(skb)->offset += count; MPTCP_SKB_CB(skb)->map_seq += count; msk->bytes_consumed += count; } break; } if (!(flags & MSG_PEEK)) { /* we will bulk release the skb memory later */ skb->destructor = NULL; WRITE_ONCE(msk->rmem_released, msk->rmem_released + skb->truesize); __skb_unlink(skb, &msk->receive_queue); __kfree_skb(skb); msk->bytes_consumed += count; } if (copied >= len) break; } return copied; } /* receive buffer autotuning. See tcp_rcv_space_adjust for more information. * * Only difference: Use highest rtt estimate of the subflows in use. 
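 * (the msk also tracks the minimum scaling_ratio across all subflows)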
*/ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; u8 scaling_ratio = U8_MAX; u32 time, advmss = 1; u64 rtt_us, mstamp; msk_owned_by_me(msk); if (copied <= 0) return; if (!msk->rcvspace_init) mptcp_rcv_space_init(msk, msk->first); msk->rcvq_space.copied += copied; mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC); time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time); rtt_us = msk->rcvq_space.rtt_us; if (rtt_us && time < (rtt_us >> 3)) return; rtt_us = 0; mptcp_for_each_subflow(msk, subflow) { const struct tcp_sock *tp; u64 sf_rtt_us; u32 sf_advmss; tp = tcp_sk(mptcp_subflow_tcp_sock(subflow)); sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us); sf_advmss = READ_ONCE(tp->advmss); rtt_us = max(sf_rtt_us, rtt_us); advmss = max(sf_advmss, advmss); scaling_ratio = min(tp->scaling_ratio, scaling_ratio); } msk->rcvq_space.rtt_us = rtt_us; msk->scaling_ratio = scaling_ratio; if (time < (rtt_us >> 3) || rtt_us == 0) return; if (msk->rcvq_space.copied <= msk->rcvq_space.space) goto new_measure; if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { u64 rcvwin, grow; int rcvbuf; rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss; grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space); do_div(grow, msk->rcvq_space.space); rcvwin += (grow << 1); rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin), READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); if (rcvbuf > sk->sk_rcvbuf) { u32 window_clamp; window_clamp = mptcp_win_from_space(sk, rcvbuf); WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); /* Make subflows follow along. If we do not do this, we * get drops at subflow level if skbs can't be moved to * the mptcp rx queue fast enough (announced rcv_win can * exceed ssk->sk_rcvbuf). */ mptcp_for_each_subflow(msk, subflow) { struct sock *ssk; bool slow; ssk = mptcp_subflow_tcp_sock(subflow); slow = lock_sock_fast(ssk); WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf); WRITE_ONCE(tcp_sk(ssk)->window_clamp, window_clamp); if (tcp_can_send_ack(ssk)) tcp_cleanup_rbuf(ssk, 1); unlock_sock_fast(ssk, slow); } } } msk->rcvq_space.space = msk->rcvq_space.copied; new_measure: msk->rcvq_space.copied = 0; msk->rcvq_space.time = mstamp; } static void __mptcp_update_rmem(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); if (!msk->rmem_released) return; atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc); mptcp_rmem_uncharge(sk, msk->rmem_released); WRITE_ONCE(msk->rmem_released, 0); } static void __mptcp_splice_receive_queue(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue); } static bool __mptcp_move_skbs(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; unsigned int moved = 0; bool ret, done; do { struct sock *ssk = mptcp_subflow_recv_lookup(msk); bool slowpath; /* we can have data pending in the subflows only if the msk * receive buffer was full at subflow_data_ready() time, * that is an unlikely slow path. 
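 * In that case drain the subflow under both its own lock and the msk data
 * lock before splicing into the msk receive queue.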
*/ if (likely(!ssk)) break; slowpath = lock_sock_fast(ssk); mptcp_data_lock(sk); __mptcp_update_rmem(sk); done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); mptcp_data_unlock(sk); if (unlikely(ssk->sk_err)) __mptcp_error_report(sk); unlock_sock_fast(ssk, slowpath); } while (!done); /* acquire the data lock only if some input data is pending */ ret = moved > 0; if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) || !skb_queue_empty_lockless(&sk->sk_receive_queue)) { mptcp_data_lock(sk); __mptcp_update_rmem(sk); ret |= __mptcp_ofo_queue(msk); __mptcp_splice_receive_queue(sk); mptcp_data_unlock(sk); } if (ret) mptcp_check_data_fin((struct sock *)msk); return !skb_queue_empty(&msk->receive_queue); } static unsigned int mptcp_inq_hint(const struct sock *sk) { const struct mptcp_sock *msk = mptcp_sk(sk); const struct sk_buff *skb; skb = skb_peek(&msk->receive_queue); if (skb) { u64 hint_val = READ_ONCE(msk->ack_seq) - MPTCP_SKB_CB(skb)->map_seq; if (hint_val >= INT_MAX) return INT_MAX; return (unsigned int)hint_val; } if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) return 1; return 0; } static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct mptcp_sock *msk = mptcp_sk(sk); struct scm_timestamping_internal tss; int copied = 0, cmsg_flags = 0; int target; long timeo; /* MSG_ERRQUEUE is really a no-op till we support IP_RECVERR */ if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); lock_sock(sk); if (unlikely(sk->sk_state == TCP_LISTEN)) { copied = -ENOTCONN; goto out_err; } timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); len = min_t(size_t, len, INT_MAX); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); if (unlikely(msk->recvmsg_inq)) cmsg_flags = MPTCP_CMSG_INQ; while (copied < len) { int err, bytes_read; bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags, &tss, &cmsg_flags); if (unlikely(bytes_read < 0)) { if (!copied) copied = bytes_read; goto out_err; } copied += bytes_read; /* be sure to advertise window change */ mptcp_cleanup_rbuf(msk); if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk)) continue; /* only the MPTCP socket status is relevant here. The exit * conditions mirror closely tcp_recvmsg() */ if (copied >= target) break; if (copied) { if (sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo || signal_pending(current)) break; } else { if (sk->sk_err) { copied = sock_error(sk); break; } if (sk->sk_shutdown & RCV_SHUTDOWN) { /* race breaker: the shutdown could be after the * previous receive queue check */ if (__mptcp_move_skbs(msk)) continue; break; } if (sk->sk_state == TCP_CLOSE) { copied = -ENOTCONN; break; } if (!timeo) { copied = -EAGAIN; break; } if (signal_pending(current)) { copied = sock_intr_errno(timeo); break; } } pr_debug("block timeout %ld\n", timeo); mptcp_rcv_space_adjust(msk, copied); err = sk_wait_data(sk, &timeo, NULL); if (err < 0) { err = copied ? 
: err; goto out_err; } } mptcp_rcv_space_adjust(msk, copied); out_err: if (cmsg_flags && copied >= 0) { if (cmsg_flags & MPTCP_CMSG_TS) tcp_recv_timestamp(msg, sk, &tss); if (cmsg_flags & MPTCP_CMSG_INQ) { unsigned int inq = mptcp_inq_hint(sk); put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); } } pr_debug("msk=%p rx queue empty=%d:%d copied=%d\n", msk, skb_queue_empty_lockless(&sk->sk_receive_queue), skb_queue_empty(&msk->receive_queue), copied); release_sock(sk); return copied; } static void mptcp_retransmit_timer(struct timer_list *t) { struct inet_connection_sock *icsk = from_timer(icsk, t, icsk_retransmit_timer); struct sock *sk = &icsk->icsk_inet.sk; struct mptcp_sock *msk = mptcp_sk(sk); bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { /* we need a process context to retransmit */ if (!test_and_set_bit(MPTCP_WORK_RTX, &msk->flags)) mptcp_schedule_work(sk); } else { /* delegate our work to tcp_release_cb() */ __set_bit(MPTCP_RETRANSMIT, &msk->cb_flags); } bh_unlock_sock(sk); sock_put(sk); } static void mptcp_tout_timer(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); mptcp_schedule_work(sk); sock_put(sk); } /* Find an idle subflow. Return NULL if there is unacked data at tcp * level. * * A backup subflow is returned only if that is the only kind available. */ struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) { struct sock *backup = NULL, *pick = NULL; struct mptcp_subflow_context *subflow; int min_stale_count = INT_MAX; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (!__mptcp_subflow_active(subflow)) continue; /* still data outstanding at TCP level? skip this */ if (!tcp_rtx_and_write_queues_empty(ssk)) { mptcp_pm_subflow_chk_stale(msk, ssk); min_stale_count = min_t(int, min_stale_count, subflow->stale_count); continue; } if (subflow->backup || subflow->request_bkup) { if (!backup) backup = ssk; continue; } if (!pick) pick = ssk; } if (pick) return pick; /* use backup only if there are no progresses anywhere */ return min_stale_count > 1 ? backup : NULL; } bool __mptcp_retransmit_pending_data(struct sock *sk) { struct mptcp_data_frag *cur, *rtx_head; struct mptcp_sock *msk = mptcp_sk(sk); if (__mptcp_check_fallback(msk)) return false; /* the closing socket has some data untransmitted and/or unacked: * some data in the mptcp rtx queue has not really xmitted yet. 
* keep it simple and re-inject the whole mptcp level rtx queue */ mptcp_data_lock(sk); __mptcp_clean_una_wakeup(sk); rtx_head = mptcp_rtx_head(sk); if (!rtx_head) { mptcp_data_unlock(sk); return false; } msk->recovery_snd_nxt = msk->snd_nxt; msk->recovery = true; mptcp_data_unlock(sk); msk->first_pending = rtx_head; msk->snd_burst = 0; /* be sure to clear the "sent status" on all re-injected fragments */ list_for_each_entry(cur, &msk->rtx_queue, list) { if (!cur->already_sent) break; cur->already_sent = 0; } return true; } /* flags for __mptcp_close_ssk() */ #define MPTCP_CF_PUSH BIT(1) #define MPTCP_CF_FASTCLOSE BIT(2) /* be sure to send a reset only if the caller asked for it, also * clean completely the subflow status when the subflow reaches * TCP_CLOSE state */ static void __mptcp_subflow_disconnect(struct sock *ssk, struct mptcp_subflow_context *subflow, unsigned int flags) { if (((1 << ssk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) || (flags & MPTCP_CF_FASTCLOSE)) { /* The MPTCP code never wait on the subflow sockets, TCP-level * disconnect should never fail */ WARN_ON_ONCE(tcp_disconnect(ssk, 0)); mptcp_subflow_ctx_reset(subflow); } else { tcp_shutdown(ssk, SEND_SHUTDOWN); } } /* subflow sockets can be either outgoing (connect) or incoming * (accept). * * Outgoing subflows use in-kernel sockets. * Incoming subflows do not have their own 'struct socket' allocated, * so we need to use tcp_close() after detaching them from the mptcp * parent socket. */ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow, unsigned int flags) { struct mptcp_sock *msk = mptcp_sk(sk); bool dispose_it, need_push = false; /* If the first subflow moved to a close state before accept, e.g. due * to an incoming reset or listener shutdown, the subflow socket is * already deleted by inet_child_forget() and the mptcp socket can't * survive too. 
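 * Mark the msk SOCK_DEAD and force the close timeout to look already
 * expired so that the worker will dispose of it.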
*/ if (msk->in_accept_queue && msk->first == ssk && (sock_flag(sk, SOCK_DEAD) || sock_flag(ssk, SOCK_DEAD))) { /* ensure later check in mptcp_worker() will dispose the msk */ sock_set_flag(sk, SOCK_DEAD); mptcp_set_close_tout(sk, tcp_jiffies32 - (mptcp_close_timeout(sk) + 1)); lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); mptcp_subflow_drop_ctx(ssk); goto out_release; } dispose_it = msk->free_first || ssk != msk->first; if (dispose_it) list_del(&subflow->node); lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); if ((flags & MPTCP_CF_FASTCLOSE) && !__mptcp_check_fallback(msk)) { /* be sure to force the tcp_close path * to generate the egress reset */ ssk->sk_lingertime = 0; sock_set_flag(ssk, SOCK_LINGER); subflow->send_fastclose = 1; } need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk); if (!dispose_it) { __mptcp_subflow_disconnect(ssk, subflow, flags); release_sock(ssk); goto out; } subflow->disposable = 1; /* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops * the ssk has been already destroyed, we just need to release the * reference owned by msk; */ if (!inet_csk(ssk)->icsk_ulp_ops) { WARN_ON_ONCE(!sock_flag(ssk, SOCK_DEAD)); kfree_rcu(subflow, rcu); } else { /* otherwise tcp will dispose of the ssk and subflow ctx */ __tcp_close(ssk, 0); /* close acquired an extra ref */ __sock_put(ssk); } out_release: __mptcp_subflow_error_report(sk, ssk); release_sock(ssk); sock_put(ssk); if (ssk == msk->first) WRITE_ONCE(msk->first, NULL); out: __mptcp_sync_sndbuf(sk); if (need_push) __mptcp_push_pending(sk, 0); /* Catch every 'all subflows closed' scenario, including peers silently * closing them, e.g. due to timeout. * For established sockets, allow an additional timeout before closing, * as the protocol can still create more subflows. 
*/ if (list_is_singular(&msk->conn_list) && msk->first && inet_sk_state_load(msk->first) == TCP_CLOSE) { if (sk->sk_state != TCP_ESTABLISHED || msk->in_accept_queue || sock_flag(sk, SOCK_DEAD)) { mptcp_set_state(sk, TCP_CLOSE); mptcp_close_wake_up(sk); } else { mptcp_start_tout_timer(sk); } } } void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow) { /* The first subflow can already be closed and still in the list */ if (subflow->close_event_done) return; subflow->close_event_done = true; if (sk->sk_state == TCP_ESTABLISHED) mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL); /* subflow aborted before reaching the fully_established status * attempt the creation of the next subflow */ mptcp_pm_subflow_check_next(mptcp_sk(sk), subflow); __mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_PUSH); } static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) { return 0; } static void __mptcp_close_subflow(struct sock *sk) { struct mptcp_subflow_context *subflow, *tmp; struct mptcp_sock *msk = mptcp_sk(sk); might_sleep(); mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int ssk_state = inet_sk_state_load(ssk); if (ssk_state != TCP_CLOSE && (ssk_state != TCP_CLOSE_WAIT || inet_sk_state_load(sk) != TCP_ESTABLISHED)) continue; /* 'subflow_data_ready' will re-sched once rx queue is empty */ if (!skb_queue_empty_lockless(&ssk->sk_receive_queue)) continue; mptcp_close_ssk(sk, ssk, subflow); } } static bool mptcp_close_tout_expired(const struct sock *sk) { if (!inet_csk(sk)->icsk_mtup.probe_timestamp || sk->sk_state == TCP_CLOSE) return false; return time_after32(tcp_jiffies32, inet_csk(sk)->icsk_mtup.probe_timestamp + mptcp_close_timeout(sk)); } static void mptcp_check_fastclose(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow, *tmp; struct sock *sk = (struct sock *)msk; if (likely(!READ_ONCE(msk->rcv_fastclose))) return; mptcp_token_destroy(msk); mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); bool slow; slow = lock_sock_fast(tcp_sk); if (tcp_sk->sk_state != TCP_CLOSE) { mptcp_send_active_reset_reason(tcp_sk); tcp_set_state(tcp_sk, TCP_CLOSE); } unlock_sock_fast(tcp_sk, slow); } /* Mirror the tcp_reset() error propagation */ switch (sk->sk_state) { case TCP_SYN_SENT: WRITE_ONCE(sk->sk_err, ECONNREFUSED); break; case TCP_CLOSE_WAIT: WRITE_ONCE(sk->sk_err, EPIPE); break; case TCP_CLOSE: return; default: WRITE_ONCE(sk->sk_err, ECONNRESET); } mptcp_set_state(sk, TCP_CLOSE); WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags); /* the calling mptcp_worker will properly destroy the socket */ if (sock_flag(sk, SOCK_DEAD)) return; sk->sk_state_change(sk); sk_error_report(sk); } static void __mptcp_retrans(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_subflow_context *subflow; struct mptcp_sendmsg_info info = {}; struct mptcp_data_frag *dfrag; struct sock *ssk; int ret, err; u16 len = 0; mptcp_clean_una_wakeup(sk); /* first check ssk: need to kick "stale" logic */ err = mptcp_sched_get_retrans(msk); dfrag = mptcp_rtx_head(sk); if (!dfrag) { if (mptcp_data_fin_enabled(msk)) { struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_retransmits++; mptcp_set_datafin_timeout(sk); mptcp_send_ack(msk); goto reset_timer; } if (!mptcp_send_head(sk)) return; goto reset_timer; } if (err) goto reset_timer; 
mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->scheduled)) { u16 copied = 0; mptcp_subflow_set_scheduled(subflow, false); ssk = mptcp_subflow_tcp_sock(subflow); lock_sock(ssk); /* limit retransmission to the bytes already sent on some subflows */ info.sent = 0; info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent; while (info.sent < info.limit) { ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); if (ret <= 0) break; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); copied += ret; info.sent += ret; } if (copied) { len = max(copied, len); tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); WRITE_ONCE(msk->allow_infinite_fallback, false); } release_sock(ssk); } } msk->bytes_retrans += len; dfrag->already_sent = max(dfrag->already_sent, len); reset_timer: mptcp_check_and_set_pending(sk); if (!mptcp_rtx_timer_pending(sk)) mptcp_reset_rtx_timer(sk); } /* schedule the timeout timer for the relevant event: either close timeout * or mp_fail timeout. The close timeout takes precedence on the mp_fail one */ void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout) { struct sock *sk = (struct sock *)msk; unsigned long timeout, close_timeout; if (!fail_tout && !inet_csk(sk)->icsk_mtup.probe_timestamp) return; close_timeout = (unsigned long)inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies + mptcp_close_timeout(sk); /* the close timeout takes precedence on the fail one, and here at least one of * them is active */ timeout = inet_csk(sk)->icsk_mtup.probe_timestamp ? close_timeout : fail_tout; sk_reset_timer(sk, &sk->sk_timer, timeout); } static void mptcp_mp_fail_no_response(struct mptcp_sock *msk) { struct sock *ssk = msk->first; bool slow; if (!ssk) return; pr_debug("MP_FAIL doesn't respond, reset the subflow\n"); slow = lock_sock_fast(ssk); mptcp_subflow_reset(ssk); WRITE_ONCE(mptcp_subflow_ctx(ssk)->fail_tout, 0); unlock_sock_fast(ssk, slow); } static void mptcp_do_fastclose(struct sock *sk) { struct mptcp_subflow_context *subflow, *tmp; struct mptcp_sock *msk = mptcp_sk(sk); mptcp_set_state(sk, TCP_CLOSE); mptcp_for_each_subflow_safe(msk, subflow, tmp) __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, MPTCP_CF_FASTCLOSE); } static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); struct sock *sk = (struct sock *)msk; unsigned long fail_tout; int state; lock_sock(sk); state = sk->sk_state; if (unlikely((1 << state) & (TCPF_CLOSE | TCPF_LISTEN))) goto unlock; mptcp_check_fastclose(msk); mptcp_pm_nl_work(msk); mptcp_check_send_data_fin(sk); mptcp_check_data_fin_ack(sk); mptcp_check_data_fin(sk); if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) __mptcp_close_subflow(sk); if (mptcp_close_tout_expired(sk)) { mptcp_do_fastclose(sk); mptcp_close_wake_up(sk); } if (sock_flag(sk, SOCK_DEAD) && sk->sk_state == TCP_CLOSE) { __mptcp_destroy_sock(sk); goto unlock; } if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) __mptcp_retrans(sk); fail_tout = msk->first ? 
READ_ONCE(mptcp_subflow_ctx(msk->first)->fail_tout) : 0; if (fail_tout && time_after(jiffies, fail_tout)) mptcp_mp_fail_no_response(msk); unlock: release_sock(sk); sock_put(sk); } static void __mptcp_init_sock(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); INIT_LIST_HEAD(&msk->conn_list); INIT_LIST_HEAD(&msk->join_list); INIT_LIST_HEAD(&msk->rtx_queue); INIT_WORK(&msk->work, mptcp_worker); __skb_queue_head_init(&msk->receive_queue); msk->out_of_order_queue = RB_ROOT; msk->first_pending = NULL; WRITE_ONCE(msk->rmem_fwd_alloc, 0); WRITE_ONCE(msk->rmem_released, 0); msk->timer_ival = TCP_RTO_MIN; msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO; WRITE_ONCE(msk->first, NULL); inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); WRITE_ONCE(msk->allow_infinite_fallback, true); msk->recovery = false; msk->subflow_id = 1; msk->last_data_sent = tcp_jiffies32; msk->last_data_recv = tcp_jiffies32; msk->last_ack_recv = tcp_jiffies32; mptcp_pm_data_init(msk); /* re-use the csk retrans timer for MPTCP-level retrans */ timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); timer_setup(&sk->sk_timer, mptcp_tout_timer, 0); } static void mptcp_ca_reset(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_assign_congestion_control(sk); strscpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name, sizeof(mptcp_sk(sk)->ca_name)); /* no need to keep a reference to the ops, the name will suffice */ tcp_cleanup_congestion_control(sk); icsk->icsk_ca_ops = NULL; } static int mptcp_init_sock(struct sock *sk) { struct net *net = sock_net(sk); int ret; __mptcp_init_sock(sk); if (!mptcp_is_enabled(net)) return -ENOPROTOOPT; if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net)) return -ENOMEM; rcu_read_lock(); ret = mptcp_init_sched(mptcp_sk(sk), mptcp_sched_find(mptcp_get_scheduler(net))); rcu_read_unlock(); if (ret) return ret; set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); /* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will * propagate the correct value */ mptcp_ca_reset(sk); sk_sockets_allocated_inc(sk); sk->sk_rcvbuf = READ_ONCE(net->ipv4.sysctl_tcp_rmem[1]); sk->sk_sndbuf = READ_ONCE(net->ipv4.sysctl_tcp_wmem[1]); return 0; } static void __mptcp_clear_xmit(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; WRITE_ONCE(msk->first_pending, NULL); list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) dfrag_clear(sk, dfrag); } void mptcp_cancel_work(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); if (cancel_work_sync(&msk->work)) __sock_put(sk); } void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) { lock_sock(ssk); switch (ssk->sk_state) { case TCP_LISTEN: if (!(how & RCV_SHUTDOWN)) break; fallthrough; case TCP_SYN_SENT: WARN_ON_ONCE(tcp_disconnect(ssk, O_NONBLOCK)); break; default: if (__mptcp_check_fallback(mptcp_sk(sk))) { pr_debug("Fallback\n"); ssk->sk_shutdown |= how; tcp_shutdown(ssk, how); /* simulate the data_fin ack reception to let the state * machine move forward */ WRITE_ONCE(mptcp_sk(sk)->snd_una, mptcp_sk(sk)->snd_nxt); mptcp_schedule_work(sk); } else { pr_debug("Sending DATA_FIN on subflow %p\n", ssk); tcp_send_ack(ssk); if (!mptcp_rtx_timer_pending(sk)) mptcp_reset_rtx_timer(sk); } break; } release_sock(ssk); } void mptcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) 
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_CURRESTAB); break; case TCP_CLOSE_WAIT: /* Unlike TCP, MPTCP sk would not have the TCP_SYN_RECV state: * MPTCP "accepted" sockets will be created later on. So no * transition from TCP_SYN_RECV to TCP_CLOSE_WAIT. */ break; default: if (oldstate == TCP_ESTABLISHED || oldstate == TCP_CLOSE_WAIT) MPTCP_DEC_STATS(sock_net(sk), MPTCP_MIB_CURRESTAB); } inet_sk_state_store(sk, state); } static const unsigned char new_state[16] = { /* current state: new state: action: */ [0 /* (Invalid) */] = TCP_CLOSE, [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, [TCP_SYN_SENT] = TCP_CLOSE, [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, [TCP_FIN_WAIT1] = TCP_FIN_WAIT1, [TCP_FIN_WAIT2] = TCP_FIN_WAIT2, [TCP_TIME_WAIT] = TCP_CLOSE, /* should not happen ! */ [TCP_CLOSE] = TCP_CLOSE, [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN, [TCP_LAST_ACK] = TCP_LAST_ACK, [TCP_LISTEN] = TCP_CLOSE, [TCP_CLOSING] = TCP_CLOSING, [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ }; static int mptcp_close_state(struct sock *sk) { int next = (int)new_state[sk->sk_state]; int ns = next & TCP_STATE_MASK; mptcp_set_state(sk, ns); return next & TCP_ACTION_FIN; } static void mptcp_check_send_data_fin(struct sock *sk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu\n", msk, msk->snd_data_fin_enable, !!mptcp_send_head(sk), msk->snd_nxt, msk->write_seq); /* we still need to enqueue subflows or not really shutting down, * skip this */ if (!msk->snd_data_fin_enable || msk->snd_nxt + 1 != msk->write_seq || mptcp_send_head(sk)) return; WRITE_ONCE(msk->snd_nxt, msk->write_seq); mptcp_for_each_subflow(msk, subflow) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); mptcp_subflow_shutdown(sk, tcp_sk, SEND_SHUTDOWN); } } static void __mptcp_wr_shutdown(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d\n", msk, msk->snd_data_fin_enable, sk->sk_shutdown, sk->sk_state, !!mptcp_send_head(sk)); /* will be ignored by fallback sockets */ WRITE_ONCE(msk->write_seq, msk->write_seq + 1); WRITE_ONCE(msk->snd_data_fin_enable, 1); mptcp_check_send_data_fin(sk); } static void __mptcp_destroy_sock(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); pr_debug("msk=%p\n", msk); might_sleep(); mptcp_stop_rtx_timer(sk); sk_stop_timer(sk, &sk->sk_timer); msk->pm.status = 0; mptcp_release_sched(msk); sk->sk_prot->destroy(sk); WARN_ON_ONCE(READ_ONCE(msk->rmem_fwd_alloc)); WARN_ON_ONCE(msk->rmem_released); sk_stream_kill_queues(sk); xfrm_sk_free_policy(sk); sock_put(sk); } void __mptcp_unaccepted_force_close(struct sock *sk) { sock_set_flag(sk, SOCK_DEAD); mptcp_do_fastclose(sk); __mptcp_destroy_sock(sk); } static __poll_t mptcp_check_readable(struct sock *sk) { return mptcp_epollin_ready(sk) ? 
EPOLLIN | EPOLLRDNORM : 0; } static void mptcp_check_listen_stop(struct sock *sk) { struct sock *ssk; if (inet_sk_state_load(sk) != TCP_LISTEN) return; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); ssk = mptcp_sk(sk)->first; if (WARN_ON_ONCE(!ssk || inet_sk_state_load(ssk) != TCP_LISTEN)) return; lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); tcp_set_state(ssk, TCP_CLOSE); mptcp_subflow_queue_clean(sk, ssk); inet_csk_listen_stop(ssk); mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED); release_sock(ssk); } bool __mptcp_close(struct sock *sk, long timeout) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); bool do_cancel_work = false; int subflows_alive = 0; WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) { mptcp_check_listen_stop(sk); mptcp_set_state(sk, TCP_CLOSE); goto cleanup; } if (mptcp_data_avail(msk) || timeout < 0) { /* If the msk has read data, or the caller explicitly ask it, * do the MPTCP equivalent of TCP reset, aka MPTCP fastclose */ mptcp_do_fastclose(sk); timeout = 0; } else if (mptcp_close_state(sk)) { __mptcp_wr_shutdown(sk); } sk_stream_wait_close(sk, timeout); cleanup: /* orphan all the subflows */ mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast_nested(ssk); subflows_alive += ssk->sk_state != TCP_CLOSE; /* since the close timeout takes precedence on the fail one, * cancel the latter */ if (ssk == msk->first) subflow->fail_tout = 0; /* detach from the parent socket, but allow data_ready to * push incoming data into the mptcp stack, to properly ack it */ ssk->sk_socket = NULL; ssk->sk_wq = NULL; unlock_sock_fast(ssk, slow); } sock_orphan(sk); /* all the subflows are closed, only timeout can change the msk * state, let's not keep resources busy for no reasons */ if (subflows_alive == 0) mptcp_set_state(sk, TCP_CLOSE); sock_hold(sk); pr_debug("msk=%p state=%d\n", sk, sk->sk_state); mptcp_pm_connection_closed(msk); if (sk->sk_state == TCP_CLOSE) { __mptcp_destroy_sock(sk); do_cancel_work = true; } else { mptcp_start_tout_timer(sk); } return do_cancel_work; } static void mptcp_close(struct sock *sk, long timeout) { bool do_cancel_work; lock_sock(sk); do_cancel_work = __mptcp_close(sk, timeout); release_sock(sk); if (do_cancel_work) mptcp_cancel_work(sk); sock_put(sk); } static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) const struct ipv6_pinfo *ssk6 = inet6_sk(ssk); struct ipv6_pinfo *msk6 = inet6_sk(msk); msk->sk_v6_daddr = ssk->sk_v6_daddr; msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr; if (msk6 && ssk6) { msk6->saddr = ssk6->saddr; msk6->flow_label = ssk6->flow_label; } #endif inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num; inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport; inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport; inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr; inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr; inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr; } static int mptcp_disconnect(struct sock *sk, int flags) { struct mptcp_sock *msk = mptcp_sk(sk); /* We are on the fastopen error path. We can't call straight into the * subflows cleanup code due to lock nesting (we are already under * msk->firstsocket lock). 
*/ if (msk->fastopening) return -EBUSY; mptcp_check_listen_stop(sk); mptcp_set_state(sk, TCP_CLOSE); mptcp_stop_rtx_timer(sk); mptcp_stop_tout_timer(sk); mptcp_pm_connection_closed(msk); /* msk->subflow is still intact, the following will not free the first * subflow */ mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE); WRITE_ONCE(msk->flags, 0); msk->cb_flags = 0; msk->recovery = false; WRITE_ONCE(msk->can_ack, false); WRITE_ONCE(msk->fully_established, false); WRITE_ONCE(msk->rcv_data_fin, false); WRITE_ONCE(msk->snd_data_fin_enable, false); WRITE_ONCE(msk->rcv_fastclose, false); WRITE_ONCE(msk->use_64bit_ack, false); WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); mptcp_pm_data_reset(msk); mptcp_ca_reset(sk); msk->bytes_consumed = 0; msk->bytes_acked = 0; msk->bytes_received = 0; msk->bytes_sent = 0; msk->bytes_retrans = 0; msk->rcvspace_init = 0; WRITE_ONCE(sk->sk_shutdown, 0); sk_error_report(sk); return 0; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk) { unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo); return (struct ipv6_pinfo *)(((u8 *)sk) + offset); } static void mptcp_copy_ip6_options(struct sock *newsk, const struct sock *sk) { const struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_txoptions *opt; struct ipv6_pinfo *newnp; newnp = inet6_sk(newsk); rcu_read_lock(); opt = rcu_dereference(np->opt); if (opt) { opt = ipv6_dup_options(newsk, opt); if (!opt) net_warn_ratelimited("%s: Failed to copy ip6 options\n", __func__); } RCU_INIT_POINTER(newnp->opt, opt); rcu_read_unlock(); } #endif static void mptcp_copy_ip_options(struct sock *newsk, const struct sock *sk) { struct ip_options_rcu *inet_opt, *newopt = NULL; const struct inet_sock *inet = inet_sk(sk); struct inet_sock *newinet; newinet = inet_sk(newsk); rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { newopt = sock_kmalloc(newsk, sizeof(*inet_opt) + inet_opt->opt.optlen, GFP_ATOMIC); if (newopt) memcpy(newopt, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); else net_warn_ratelimited("%s: Failed to copy ip options\n", __func__); } RCU_INIT_POINTER(newinet->inet_opt, newopt); rcu_read_unlock(); } struct sock *mptcp_sk_clone_init(const struct sock *sk, const struct mptcp_options_received *mp_opt, struct sock *ssk, struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC); struct mptcp_subflow_context *subflow; struct mptcp_sock *msk; if (!nsk) return NULL; #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (nsk->sk_family == AF_INET6) inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk); #endif __mptcp_init_sock(nsk); #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (nsk->sk_family == AF_INET6) mptcp_copy_ip6_options(nsk, sk); else #endif mptcp_copy_ip_options(nsk, sk); msk = mptcp_sk(nsk); WRITE_ONCE(msk->local_key, subflow_req->local_key); WRITE_ONCE(msk->token, subflow_req->token); msk->in_accept_queue = 1; WRITE_ONCE(msk->fully_established, false); if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD) WRITE_ONCE(msk->csum_enabled, true); WRITE_ONCE(msk->write_seq, subflow_req->idsn + 1); WRITE_ONCE(msk->snd_nxt, msk->write_seq); WRITE_ONCE(msk->snd_una, msk->write_seq); WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd); msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; mptcp_init_sched(msk, mptcp_sk(sk)->sched); /* passive msk is created after the first/MPC subflow */ msk->subflow_id = 2; sock_reset_flag(nsk, SOCK_RCU_FREE); 
security_inet_csk_clone(nsk, req); /* this can't race with mptcp_close(), as the msk is * not yet exposted to user-space */ mptcp_set_state(nsk, TCP_ESTABLISHED); /* The msk maintain a ref to each subflow in the connections list */ WRITE_ONCE(msk->first, ssk); subflow = mptcp_subflow_ctx(ssk); list_add(&subflow->node, &msk->conn_list); sock_hold(ssk); /* new mpc subflow takes ownership of the newly * created mptcp socket */ mptcp_token_accept(subflow_req, msk); /* set msk addresses early to ensure mptcp_pm_get_local_id() * uses the correct data */ mptcp_copy_inaddrs(nsk, ssk); __mptcp_propagate_sndbuf(nsk, ssk); mptcp_rcv_space_init(msk, ssk); if (mp_opt->suboptions & OPTION_MPTCP_MPC_ACK) __mptcp_subflow_fully_established(msk, subflow, mp_opt); bh_unlock_sock(nsk); /* note: the newly allocated socket refcount is 2 now */ return nsk; } void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) { const struct tcp_sock *tp = tcp_sk(ssk); msk->rcvspace_init = 1; msk->rcvq_space.copied = 0; msk->rcvq_space.rtt_us = 0; msk->rcvq_space.time = tp->tcp_mstamp; /* initial rcv_space offering made to peer */ msk->rcvq_space.space = min_t(u32, tp->rcv_wnd, TCP_INIT_CWND * tp->advmss); if (msk->rcvq_space.space == 0) msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; } void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags) { struct mptcp_subflow_context *subflow, *tmp; struct sock *sk = (struct sock *)msk; __mptcp_clear_xmit(sk); /* join list will be eventually flushed (with rst) at sock lock release time */ mptcp_for_each_subflow_safe(msk, subflow, tmp) __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags); /* move to sk_receive_queue, sk_stream_kill_queues will purge it */ mptcp_data_lock(sk); skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue); __skb_queue_purge(&sk->sk_receive_queue); skb_rbtree_purge(&msk->out_of_order_queue); mptcp_data_unlock(sk); /* move all the rx fwd alloc into the sk_mem_reclaim_final in * inet_sock_destruct() will dispose it */ sk_forward_alloc_add(sk, msk->rmem_fwd_alloc); WRITE_ONCE(msk->rmem_fwd_alloc, 0); mptcp_token_destroy(msk); mptcp_pm_free_anno_list(msk); mptcp_free_local_addr_list(msk); } static void mptcp_destroy(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); /* allow the following to close even the initial subflow */ msk->free_first = 1; mptcp_destroy_common(msk, 0); sk_sockets_allocated_dec(sk); } void __mptcp_data_acked(struct sock *sk) { if (!sock_owned_by_user(sk)) __mptcp_clean_una(sk); else __set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->cb_flags); } void __mptcp_check_push(struct sock *sk, struct sock *ssk) { if (!mptcp_send_head(sk)) return; if (!sock_owned_by_user(sk)) __mptcp_subflow_push_pending(sk, ssk, false); else __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags); } #define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \ BIT(MPTCP_RETRANSMIT) | \ BIT(MPTCP_FLUSH_JOIN_LIST)) /* processes deferred events and flush wmem */ static void mptcp_release_cb(struct sock *sk) __must_hold(&sk->sk_lock.slock) { struct mptcp_sock *msk = mptcp_sk(sk); for (;;) { unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED); struct list_head join_list; if (!flags) break; INIT_LIST_HEAD(&join_list); list_splice_init(&msk->join_list, &join_list); /* the following actions acquire the subflow socket lock * * 1) can't be invoked in atomic scope * 2) must avoid ABBA deadlock with msk socket spinlock: the RX * datapath acquires the msk socket spinlock while helding * the 
subflow socket lock */ msk->cb_flags &= ~flags; spin_unlock_bh(&sk->sk_lock.slock); if (flags & BIT(MPTCP_FLUSH_JOIN_LIST)) __mptcp_flush_join_list(sk, &join_list); if (flags & BIT(MPTCP_PUSH_PENDING)) __mptcp_push_pending(sk, 0); if (flags & BIT(MPTCP_RETRANSMIT)) __mptcp_retrans(sk); cond_resched(); spin_lock_bh(&sk->sk_lock.slock); } if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags)) __mptcp_clean_una_wakeup(sk); if (unlikely(msk->cb_flags)) { /* be sure to sync the msk state before taking actions * depending on sk_state (MPTCP_ERROR_REPORT) * On sk release avoid actions depending on the first subflow */ if (__test_and_clear_bit(MPTCP_SYNC_STATE, &msk->cb_flags) && msk->first) __mptcp_sync_state(sk, msk->pending_state); if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags)) __mptcp_error_report(sk); if (__test_and_clear_bit(MPTCP_SYNC_SNDBUF, &msk->cb_flags)) __mptcp_sync_sndbuf(sk); } __mptcp_update_rmem(sk); } /* MP_JOIN client subflow must wait for 4th ack before sending any data: * TCP can't schedule delack timer before the subflow is fully established. * MPTCP uses the delack timer to do 3rd ack retransmissions */ static void schedule_3rdack_retransmission(struct sock *ssk) { struct inet_connection_sock *icsk = inet_csk(ssk); struct tcp_sock *tp = tcp_sk(ssk); unsigned long timeout; if (READ_ONCE(mptcp_subflow_ctx(ssk)->fully_established)) return; /* reschedule with a timeout above RTT, as we must look only for drop */ if (tp->srtt_us) timeout = usecs_to_jiffies(tp->srtt_us >> (3 - 1)); else timeout = TCP_TIMEOUT_INIT; timeout += jiffies; WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER); smp_store_release(&icsk->icsk_ack.pending, icsk->icsk_ack.pending | ICSK_ACK_SCHED | ICSK_ACK_TIMER); icsk->icsk_ack.timeout = timeout; sk_reset_timer(ssk, &icsk->icsk_delack_timer, timeout); } void mptcp_subflow_process_delegated(struct sock *ssk, long status) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; if (status & BIT(MPTCP_DELEGATE_SEND)) { mptcp_data_lock(sk); if (!sock_owned_by_user(sk)) __mptcp_subflow_push_pending(sk, ssk, true); else __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); } if (status & BIT(MPTCP_DELEGATE_SNDBUF)) { mptcp_data_lock(sk); if (!sock_owned_by_user(sk)) __mptcp_sync_sndbuf(sk); else __set_bit(MPTCP_SYNC_SNDBUF, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); } if (status & BIT(MPTCP_DELEGATE_ACK)) schedule_3rdack_retransmission(ssk); } static int mptcp_hash(struct sock *sk) { /* should never be called, * we hash the TCP subflows not the MPTCP socket */ WARN_ON_ONCE(1); return 0; } static void mptcp_unhash(struct sock *sk) { /* called from sk_common_release(), but nothing to do here */ } static int mptcp_get_port(struct sock *sk, unsigned short snum) { struct mptcp_sock *msk = mptcp_sk(sk); pr_debug("msk=%p, ssk=%p\n", msk, msk->first); if (WARN_ON_ONCE(!msk->first)) return -EINVAL; return inet_csk_get_port(msk->first, snum); } void mptcp_finish_connect(struct sock *ssk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk; struct sock *sk; subflow = mptcp_subflow_ctx(ssk); sk = subflow->conn; msk = mptcp_sk(sk); pr_debug("msk=%p, token=%u\n", sk, subflow->token); subflow->map_seq = subflow->iasn; subflow->map_subflow_seq = 1; /* the socket is not connected yet, no msk/subflow ops can access/race * accessing the field below */ WRITE_ONCE(msk->local_key, subflow->local_key); mptcp_pm_new_connection(msk, ssk, 0); } void mptcp_sock_graft(struct sock *sk, 
struct socket *parent) { write_lock_bh(&sk->sk_callback_lock); rcu_assign_pointer(sk->sk_wq, &parent->wq); sk_set_socket(sk, parent); sk->sk_uid = SOCK_INODE(parent)->i_uid; write_unlock_bh(&sk->sk_callback_lock); } bool mptcp_finish_join(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct sock *parent = (void *)msk; bool ret = true; pr_debug("msk=%p, subflow=%p\n", msk, subflow); /* mptcp socket already closing? */ if (!mptcp_is_fully_established(parent)) { subflow->reset_reason = MPTCP_RST_EMPTCP; return false; } /* active subflow, already present inside the conn_list */ if (!list_empty(&subflow->node)) { mptcp_subflow_joined(msk, ssk); mptcp_propagate_sndbuf(parent, ssk); return true; } if (!mptcp_pm_allow_new_subflow(msk)) goto err_prohibited; /* If we can't acquire msk socket lock here, let the release callback * handle it */ mptcp_data_lock(parent); if (!sock_owned_by_user(parent)) { ret = __mptcp_finish_join(msk, ssk); if (ret) { sock_hold(ssk); list_add_tail(&subflow->node, &msk->conn_list); } } else { sock_hold(ssk); list_add_tail(&subflow->node, &msk->join_list); __set_bit(MPTCP_FLUSH_JOIN_LIST, &msk->cb_flags); } mptcp_data_unlock(parent); if (!ret) { err_prohibited: subflow->reset_reason = MPTCP_RST_EPROHIBIT; return false; } return true; } static void mptcp_shutdown(struct sock *sk, int how) { pr_debug("sk=%p, how=%d\n", sk, how); if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk)) __mptcp_wr_shutdown(sk); } static int mptcp_forward_alloc_get(const struct sock *sk) { return READ_ONCE(sk->sk_forward_alloc) + READ_ONCE(mptcp_sk(sk)->rmem_fwd_alloc); } static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v) { const struct sock *sk = (void *)msk; u64 delta; if (sk->sk_state == TCP_LISTEN) return -EINVAL; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) return 0; delta = msk->write_seq - v; if (__mptcp_check_fallback(msk) && msk->first) { struct tcp_sock *tp = tcp_sk(msk->first); /* the first subflow is disconnected after close - see * __mptcp_close_ssk(). tcp_disconnect() moves the write_seq * so ignore that status, too. */ if (!((1 << msk->first->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))) delta += READ_ONCE(tp->write_seq) - tp->snd_una; } if (delta > INT_MAX) delta = INT_MAX; return (int)delta; } static int mptcp_ioctl(struct sock *sk, int cmd, int *karg) { struct mptcp_sock *msk = mptcp_sk(sk); bool slow; switch (cmd) { case SIOCINQ: if (sk->sk_state == TCP_LISTEN) return -EINVAL; lock_sock(sk); __mptcp_move_skbs(msk); *karg = mptcp_inq_hint(sk); release_sock(sk); break; case SIOCOUTQ: slow = lock_sock_fast(sk); *karg = mptcp_ioctl_outq(msk, READ_ONCE(msk->snd_una)); unlock_sock_fast(sk, slow); break; case SIOCOUTQNSD: slow = lock_sock_fast(sk); *karg = mptcp_ioctl_outq(msk, msk->snd_nxt); unlock_sock_fast(sk, slow); break; default: return -ENOIOCTLCMD; } return 0; } static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); int err = -EINVAL; struct sock *ssk; ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) return PTR_ERR(ssk); mptcp_set_state(sk, TCP_SYN_SENT); subflow = mptcp_subflow_ctx(ssk); #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of * TCP option space. 
*/ if (rcu_access_pointer(tcp_sk(ssk)->md5sig_info)) mptcp_subflow_early_fallback(msk, subflow); #endif if (subflow->request_mptcp) { if (mptcp_active_should_disable(sk)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEACTIVEDISABLED); mptcp_subflow_early_fallback(msk, subflow); } else if (mptcp_token_new_connect(ssk) < 0) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_TOKENFALLBACKINIT); mptcp_subflow_early_fallback(msk, subflow); } } WRITE_ONCE(msk->write_seq, subflow->idsn); WRITE_ONCE(msk->snd_nxt, subflow->idsn); WRITE_ONCE(msk->snd_una, subflow->idsn); if (likely(!__mptcp_check_fallback(msk))) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVE); /* if reaching here via the fastopen/sendmsg path, the caller already * acquired the subflow socket lock, too. */ if (!msk->fastopening) lock_sock(ssk); /* the following mirrors closely a very small chunk of code from * __inet_stream_connect() */ if (ssk->sk_state != TCP_CLOSE) goto out; if (BPF_CGROUP_PRE_CONNECT_ENABLED(ssk)) { err = ssk->sk_prot->pre_connect(ssk, uaddr, addr_len); if (err) goto out; } err = ssk->sk_prot->connect(ssk, uaddr, addr_len); if (err < 0) goto out; inet_assign_bit(DEFER_CONNECT, sk, inet_test_bit(DEFER_CONNECT, ssk)); out: if (!msk->fastopening) release_sock(ssk); /* on successful connect, the msk state will be moved to established by * subflow_finish_connect() */ if (unlikely(err)) { /* avoid leaving a dangling token in an unconnected socket */ mptcp_token_destroy(msk); mptcp_set_state(sk, TCP_CLOSE); return err; } mptcp_copy_inaddrs(sk, ssk); return 0; } static struct proto mptcp_prot = { .name = "MPTCP", .owner = THIS_MODULE, .init = mptcp_init_sock, .connect = mptcp_connect, .disconnect = mptcp_disconnect, .close = mptcp_close, .setsockopt = mptcp_setsockopt, .getsockopt = mptcp_getsockopt, .shutdown = mptcp_shutdown, .destroy = mptcp_destroy, .sendmsg = mptcp_sendmsg, .ioctl = mptcp_ioctl, .recvmsg = mptcp_recvmsg, .release_cb = mptcp_release_cb, .hash = mptcp_hash, .unhash = mptcp_unhash, .get_port = mptcp_get_port, .forward_alloc_get = mptcp_forward_alloc_get, .stream_memory_free = mptcp_stream_memory_free, .sockets_allocated = &mptcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, .memory_pressure = &tcp_memory_pressure, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .sysctl_mem = sysctl_tcp_mem, .obj_size = sizeof(struct mptcp_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, .no_autobind = true, }; static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct sock *ssk, *sk = sock->sk; int err = -EINVAL; lock_sock(sk); ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) { err = PTR_ERR(ssk); goto unlock; } if (sk->sk_family == AF_INET) err = inet_bind_sk(ssk, uaddr, addr_len); #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (sk->sk_family == AF_INET6) err = inet6_bind_sk(ssk, uaddr, addr_len); #endif if (!err) mptcp_copy_inaddrs(sk, ssk); unlock: release_sock(sk); return err; } static int mptcp_listen(struct socket *sock, int backlog) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct sock *sk = sock->sk; struct sock *ssk; int err; pr_debug("msk=%p\n", msk); lock_sock(sk); err = -EINVAL; if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) goto unlock; ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) { err = PTR_ERR(ssk); goto unlock; } mptcp_set_state(sk, TCP_LISTEN); sock_set_flag(sk, 
SOCK_RCU_FREE); lock_sock(ssk); err = __inet_listen_sk(ssk, backlog); release_sock(ssk); mptcp_set_state(sk, inet_sk_state_load(ssk)); if (!err) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); mptcp_copy_inaddrs(sk, ssk); mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED); } unlock: release_sock(sk); return err; } static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct sock *ssk, *newsk; pr_debug("msk=%p\n", msk); /* Buggy applications can call accept on socket states other then LISTEN * but no need to allocate the first subflow just to error out. */ ssk = READ_ONCE(msk->first); if (!ssk) return -EINVAL; pr_debug("ssk=%p, listener=%p\n", ssk, mptcp_subflow_ctx(ssk)); newsk = inet_csk_accept(ssk, arg); if (!newsk) return arg->err; pr_debug("newsk=%p, subflow is mptcp=%d\n", newsk, sk_is_mptcp(newsk)); if (sk_is_mptcp(newsk)) { struct mptcp_subflow_context *subflow; struct sock *new_mptcp_sock; subflow = mptcp_subflow_ctx(newsk); new_mptcp_sock = subflow->conn; /* is_mptcp should be false if subflow->conn is missing, see * subflow_syn_recv_sock() */ if (WARN_ON_ONCE(!new_mptcp_sock)) { tcp_sk(newsk)->is_mptcp = 0; goto tcpfallback; } newsk = new_mptcp_sock; MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK); newsk->sk_kern_sock = arg->kern; lock_sock(newsk); __inet_accept(sock, newsock, newsk); set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags); msk = mptcp_sk(newsk); msk->in_accept_queue = 0; /* set ssk->sk_socket of accept()ed flows to mptcp socket. * This is needed so NOSPACE flag can be set from tcp stack. */ mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (!ssk->sk_socket) mptcp_sock_graft(ssk, newsock); } /* Do late cleanup for the first subflow as necessary. Also * deal with bad peers not doing a complete shutdown. */ if (unlikely(inet_sk_state_load(msk->first) == TCP_CLOSE)) { __mptcp_close_ssk(newsk, msk->first, mptcp_subflow_ctx(msk->first), 0); if (unlikely(list_is_singular(&msk->conn_list))) mptcp_set_state(newsk, TCP_CLOSE); } } else { tcpfallback: newsk->sk_kern_sock = arg->kern; lock_sock(newsk); __inet_accept(sock, newsock, newsk); /* we are being invoked after accepting a non-mp-capable * flow: sk is a tcp_sk, not an mptcp one. * * Hand the socket over to tcp so all further socket ops * bypass mptcp. 
*/ WRITE_ONCE(newsock->sk->sk_socket->ops, mptcp_fallback_tcp_ops(newsock->sk)); } release_sock(newsk); return 0; } static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; if (__mptcp_stream_is_writeable(sk, 1)) return EPOLLOUT | EPOLLWRNORM; set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); smp_mb__after_atomic(); /* NOSPACE is changed by mptcp_write_space() */ if (__mptcp_stream_is_writeable(sk, 1)) return EPOLLOUT | EPOLLWRNORM; return 0; } static __poll_t mptcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { struct sock *sk = sock->sk; struct mptcp_sock *msk; __poll_t mask = 0; u8 shutdown; int state; msk = mptcp_sk(sk); sock_poll_wait(file, sock, wait); state = inet_sk_state_load(sk); pr_debug("msk=%p state=%d flags=%lx\n", msk, state, msk->flags); if (state == TCP_LISTEN) { struct sock *ssk = READ_ONCE(msk->first); if (WARN_ON_ONCE(!ssk)) return 0; return inet_csk_listen_poll(ssk); } shutdown = READ_ONCE(sk->sk_shutdown); if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) mask |= EPOLLHUP; if (shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { mask |= mptcp_check_readable(sk); if (shutdown & SEND_SHUTDOWN) mask |= EPOLLOUT | EPOLLWRNORM; else mask |= mptcp_check_writeable(msk); } else if (state == TCP_SYN_SENT && inet_test_bit(DEFER_CONNECT, sk)) { /* cf tcp_poll() note about TFO */ mask |= EPOLLOUT | EPOLLWRNORM; } /* This barrier is coupled with smp_wmb() in __mptcp_error_report() */ smp_rmb(); if (READ_ONCE(sk->sk_err)) mask |= EPOLLERR; return mask; } static const struct proto_ops mptcp_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = mptcp_bind, .connect = inet_stream_connect, .socketpair = sock_no_socketpair, .accept = mptcp_stream_accept, .getname = inet_getname, .poll = mptcp_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, .listen = mptcp_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .set_rcvlowat = mptcp_set_rcvlowat, }; static struct inet_protosw mptcp_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_MPTCP, .prot = &mptcp_prot, .ops = &mptcp_stream_ops, .flags = INET_PROTOSW_ICSK, }; static int mptcp_napi_poll(struct napi_struct *napi, int budget) { struct mptcp_delegated_action *delegated; struct mptcp_subflow_context *subflow; int work_done = 0; delegated = container_of(napi, struct mptcp_delegated_action, napi); while ((subflow = mptcp_subflow_delegated_next(delegated)) != NULL) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bh_lock_sock_nested(ssk); if (!sock_owned_by_user(ssk)) { mptcp_subflow_process_delegated(ssk, xchg(&subflow->delegated_status, 0)); } else { /* tcp_release_cb_override already processed * the action or will do at next release_sock(). * In both case must dequeue the subflow here - on the same * CPU that scheduled it. 
*/ smp_wmb(); clear_bit(MPTCP_DELEGATE_SCHEDULED, &subflow->delegated_status); } bh_unlock_sock(ssk); sock_put(ssk); if (++work_done == budget) return budget; } /* always provide a 0 'work_done' argument, so that napi_complete_done * will not try accessing the NULL napi->dev ptr */ napi_complete_done(napi, 0); return work_done; } void __init mptcp_proto_init(void) { struct mptcp_delegated_action *delegated; int cpu; mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo; if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL)) panic("Failed to allocate MPTCP pcpu counter\n"); init_dummy_netdev(&mptcp_napi_dev); for_each_possible_cpu(cpu) { delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu); INIT_LIST_HEAD(&delegated->head); netif_napi_add_tx(&mptcp_napi_dev, &delegated->napi, mptcp_napi_poll); napi_enable(&delegated->napi); } mptcp_subflow_init(); mptcp_pm_init(); mptcp_sched_init(); mptcp_token_init(); if (proto_register(&mptcp_prot, 1) != 0) panic("Failed to register MPTCP proto.\n"); inet_register_protosw(&mptcp_protosw); BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb)); } #if IS_ENABLED(CONFIG_MPTCP_IPV6) static const struct proto_ops mptcp_v6_stream_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, .bind = mptcp_bind, .connect = inet_stream_connect, .socketpair = sock_no_socketpair, .accept = mptcp_stream_accept, .getname = inet6_getname, .poll = mptcp_poll, .ioctl = inet6_ioctl, .gettstamp = sock_gettstamp, .listen = mptcp_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet6_sendmsg, .recvmsg = inet6_recvmsg, .mmap = sock_no_mmap, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif .set_rcvlowat = mptcp_set_rcvlowat, }; static struct proto mptcp_v6_prot; static struct inet_protosw mptcp_v6_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_MPTCP, .prot = &mptcp_v6_prot, .ops = &mptcp_v6_stream_ops, .flags = INET_PROTOSW_ICSK, }; int __init mptcp_proto_v6_init(void) { int err; mptcp_v6_prot = mptcp_prot; strscpy(mptcp_v6_prot.name, "MPTCPv6", sizeof(mptcp_v6_prot.name)); mptcp_v6_prot.slab = NULL; mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock); mptcp_v6_prot.ipv6_pinfo_offset = offsetof(struct mptcp6_sock, np); err = proto_register(&mptcp_v6_prot, 1); if (err) return err; err = inet6_register_protosw(&mptcp_v6_protosw); if (err) proto_unregister(&mptcp_v6_prot); return err; } #endif
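/* Illustrative userspace sketch: registering mptcp_protosw above makes the
 * protocol reachable from applications as IPPROTO_MPTCP on a SOCK_STREAM
 * socket. A minimal client, assuming a kernel built with CONFIG_MPTCP and
 * the net.mptcp.enabled sysctl set (mptcp_init_sock() above returns
 * -ENOPROTOOPT when MPTCP is disabled). Address and port are placeholders.
 */
#include <stdio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP 262	/* from include/uapi/linux/in.h */
#endif

int main(void)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = htons(8080),	/* placeholder port */
	};
	int fd;

	inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);

	/* same as a plain TCP client, except for the protocol argument */
	fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);
	if (fd < 0) {
		perror("socket");
		return 1;
	}
	/* if the peer is not MPTCP-capable, the kernel falls back to TCP */
	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		perror("connect");
	close(fd);
	return 0;
}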
/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM ext4 #if !defined(_TRACE_EXT4_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_EXT4_H #include <linux/writeback.h> #include <linux/tracepoint.h> struct ext4_allocation_context; struct ext4_allocation_request; struct ext4_extent; struct ext4_prealloc_space; struct ext4_inode_info; struct mpage_da_data; 
struct ext4_map_blocks; struct extent_status; struct ext4_fsmap; struct partial_cluster; #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) #define show_mballoc_flags(flags) __print_flags(flags, "|", \ { EXT4_MB_HINT_MERGE, "HINT_MERGE" }, \ { EXT4_MB_HINT_RESERVED, "HINT_RESV" }, \ { EXT4_MB_HINT_METADATA, "HINT_MDATA" }, \ { EXT4_MB_HINT_FIRST, "HINT_FIRST" }, \ { EXT4_MB_HINT_BEST, "HINT_BEST" }, \ { EXT4_MB_HINT_DATA, "HINT_DATA" }, \ { EXT4_MB_HINT_NOPREALLOC, "HINT_NOPREALLOC" }, \ { EXT4_MB_HINT_GROUP_ALLOC, "HINT_GRP_ALLOC" }, \ { EXT4_MB_HINT_GOAL_ONLY, "HINT_GOAL_ONLY" }, \ { EXT4_MB_HINT_TRY_GOAL, "HINT_TRY_GOAL" }, \ { EXT4_MB_DELALLOC_RESERVED, "DELALLOC_RESV" }, \ { EXT4_MB_STREAM_ALLOC, "STREAM_ALLOC" }, \ { EXT4_MB_USE_ROOT_BLOCKS, "USE_ROOT_BLKS" }, \ { EXT4_MB_USE_RESERVED, "USE_RESV" }, \ { EXT4_MB_STRICT_CHECK, "STRICT_CHECK" }) #define show_map_flags(flags) __print_flags(flags, "|", \ { EXT4_GET_BLOCKS_CREATE, "CREATE" }, \ { EXT4_GET_BLOCKS_UNWRIT_EXT, "UNWRIT" }, \ { EXT4_GET_BLOCKS_DELALLOC_RESERVE, "DELALLOC" }, \ { EXT4_GET_BLOCKS_PRE_IO, "PRE_IO" }, \ { EXT4_GET_BLOCKS_CONVERT, "CONVERT" }, \ { EXT4_GET_BLOCKS_METADATA_NOFAIL, "METADATA_NOFAIL" }, \ { EXT4_GET_BLOCKS_NO_NORMALIZE, "NO_NORMALIZE" }, \ { EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, "CONVERT_UNWRITTEN" }, \ { EXT4_GET_BLOCKS_ZERO, "ZERO" }, \ { EXT4_GET_BLOCKS_IO_SUBMIT, "IO_SUBMIT" }, \ { EXT4_EX_NOCACHE, "EX_NOCACHE" }) /* * __print_flags() requires that all enum values be wrapped in the * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace * ring buffer. */ TRACE_DEFINE_ENUM(BH_New); TRACE_DEFINE_ENUM(BH_Mapped); TRACE_DEFINE_ENUM(BH_Unwritten); TRACE_DEFINE_ENUM(BH_Boundary); #define show_mflags(flags) __print_flags(flags, "", \ { EXT4_MAP_NEW, "N" }, \ { EXT4_MAP_MAPPED, "M" }, \ { EXT4_MAP_UNWRITTEN, "U" }, \ { EXT4_MAP_BOUNDARY, "B" }) #define show_free_flags(flags) __print_flags(flags, "|", \ { EXT4_FREE_BLOCKS_METADATA, "METADATA" }, \ { EXT4_FREE_BLOCKS_FORGET, "FORGET" }, \ { EXT4_FREE_BLOCKS_VALIDATED, "VALIDATED" }, \ { EXT4_FREE_BLOCKS_NO_QUOT_UPDATE, "NO_QUOTA" }, \ { EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER,"1ST_CLUSTER" },\ { EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER, "LAST_CLUSTER" }) TRACE_DEFINE_ENUM(ES_WRITTEN_B); TRACE_DEFINE_ENUM(ES_UNWRITTEN_B); TRACE_DEFINE_ENUM(ES_DELAYED_B); TRACE_DEFINE_ENUM(ES_HOLE_B); TRACE_DEFINE_ENUM(ES_REFERENCED_B); #define show_extent_status(status) __print_flags(status, "", \ { EXTENT_STATUS_WRITTEN, "W" }, \ { EXTENT_STATUS_UNWRITTEN, "U" }, \ { EXTENT_STATUS_DELAYED, "D" }, \ { EXTENT_STATUS_HOLE, "H" }, \ { EXTENT_STATUS_REFERENCED, "R" }) #define show_falloc_mode(mode) __print_flags(mode, "|", \ { FALLOC_FL_KEEP_SIZE, "KEEP_SIZE"}, \ { FALLOC_FL_PUNCH_HOLE, "PUNCH_HOLE"}, \ { FALLOC_FL_COLLAPSE_RANGE, "COLLAPSE_RANGE"}, \ { FALLOC_FL_ZERO_RANGE, "ZERO_RANGE"}) TRACE_DEFINE_ENUM(EXT4_FC_REASON_XATTR); TRACE_DEFINE_ENUM(EXT4_FC_REASON_CROSS_RENAME); TRACE_DEFINE_ENUM(EXT4_FC_REASON_JOURNAL_FLAG_CHANGE); TRACE_DEFINE_ENUM(EXT4_FC_REASON_NOMEM); TRACE_DEFINE_ENUM(EXT4_FC_REASON_SWAP_BOOT); TRACE_DEFINE_ENUM(EXT4_FC_REASON_RESIZE); TRACE_DEFINE_ENUM(EXT4_FC_REASON_RENAME_DIR); TRACE_DEFINE_ENUM(EXT4_FC_REASON_FALLOC_RANGE); TRACE_DEFINE_ENUM(EXT4_FC_REASON_INODE_JOURNAL_DATA); TRACE_DEFINE_ENUM(EXT4_FC_REASON_ENCRYPTED_FILENAME); TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX); #define show_fc_reason(reason) \ __print_symbolic(reason, \ { EXT4_FC_REASON_XATTR, "XATTR"}, \ { EXT4_FC_REASON_CROSS_RENAME, "CROSS_RENAME"}, \ { 
EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, "JOURNAL_FLAG_CHANGE"}, \ { EXT4_FC_REASON_NOMEM, "NO_MEM"}, \ { EXT4_FC_REASON_SWAP_BOOT, "SWAP_BOOT"}, \ { EXT4_FC_REASON_RESIZE, "RESIZE"}, \ { EXT4_FC_REASON_RENAME_DIR, "RENAME_DIR"}, \ { EXT4_FC_REASON_FALLOC_RANGE, "FALLOC_RANGE"}, \ { EXT4_FC_REASON_INODE_JOURNAL_DATA, "INODE_JOURNAL_DATA"}, \ { EXT4_FC_REASON_ENCRYPTED_FILENAME, "ENCRYPTED_FILENAME"}) TRACE_DEFINE_ENUM(CR_POWER2_ALIGNED); TRACE_DEFINE_ENUM(CR_GOAL_LEN_FAST); TRACE_DEFINE_ENUM(CR_BEST_AVAIL_LEN); TRACE_DEFINE_ENUM(CR_GOAL_LEN_SLOW); TRACE_DEFINE_ENUM(CR_ANY_FREE); #define show_criteria(cr) \ __print_symbolic(cr, \ { CR_POWER2_ALIGNED, "CR_POWER2_ALIGNED" }, \ { CR_GOAL_LEN_FAST, "CR_GOAL_LEN_FAST" }, \ { CR_BEST_AVAIL_LEN, "CR_BEST_AVAIL_LEN" }, \ { CR_GOAL_LEN_SLOW, "CR_GOAL_LEN_SLOW" }, \ { CR_ANY_FREE, "CR_ANY_FREE" }) TRACE_EVENT(ext4_other_inode_update_time, TP_PROTO(struct inode *inode, ino_t orig_ino), TP_ARGS(inode, orig_ino), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ino_t, orig_ino ) __field( uid_t, uid ) __field( gid_t, gid ) __field( __u16, mode ) ), TP_fast_assign( __entry->orig_ino = orig_ino; __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->uid = i_uid_read(inode); __entry->gid = i_gid_read(inode); __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d orig_ino %lu ino %lu mode 0%o uid %u gid %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->orig_ino, (unsigned long) __entry->ino, __entry->mode, __entry->uid, __entry->gid) ); TRACE_EVENT(ext4_free_inode, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( uid_t, uid ) __field( gid_t, gid ) __field( __u64, blocks ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->uid = i_uid_read(inode); __entry->gid = i_gid_read(inode); __entry->blocks = inode->i_blocks; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->uid, __entry->gid, __entry->blocks) ); TRACE_EVENT(ext4_request_inode, TP_PROTO(struct inode *dir, int mode), TP_ARGS(dir, mode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, dir ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->dir = dir->i_ino; __entry->mode = mode; ), TP_printk("dev %d,%d dir %lu mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->dir, __entry->mode) ); TRACE_EVENT(ext4_allocate_inode, TP_PROTO(struct inode *inode, struct inode *dir, int mode), TP_ARGS(inode, dir, mode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ino_t, dir ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->dir = dir->i_ino; __entry->mode = mode; ), TP_printk("dev %d,%d ino %lu dir %lu mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned long) __entry->dir, __entry->mode) ); TRACE_EVENT(ext4_evict_inode, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, nlink ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->nlink = inode->i_nlink; ), TP_printk("dev %d,%d ino %lu nlink %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->nlink) ); 
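/*
 * Illustrative sketch, not part of the upstream header: when fs/ext4/super.c
 * defines CREATE_TRACE_POINTS and includes this file, each TRACE_EVENT()
 * above expands into a trace_<name>() helper plus the ftrace plumbing that
 * records the TP_STRUCT__entry fields and prints them with TP_printk. The
 * filesystem then fires an event with a one-line hook; the body below is a
 * simplified assumption of how ext4_evict_inode() uses its hook, not a
 * verbatim copy of fs/ext4/inode.c. Once built in, the events can be toggled
 * at runtime under /sys/kernel/tracing/events/ext4/.
 */
#if 0	/* example only, never compiled */
void ext4_evict_inode(struct inode *inode)
{
	/* records dev, ino and nlink as laid out in TP_STRUCT__entry above */
	trace_ext4_evict_inode(inode);
	/* ... the real eviction work follows ... */
}
#endif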
TRACE_EVENT(ext4_drop_inode, TP_PROTO(struct inode *inode, int drop), TP_ARGS(inode, drop), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, drop ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->drop = drop; ), TP_printk("dev %d,%d ino %lu drop %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->drop) ); TRACE_EVENT(ext4_nfs_commit_metadata, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; ), TP_printk("dev %d,%d ino %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino) ); TRACE_EVENT(ext4_mark_inode_dirty, TP_PROTO(struct inode *inode, unsigned long IP), TP_ARGS(inode, IP), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field(unsigned long, ip ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->ip = IP; ), TP_printk("dev %d,%d ino %lu caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (void *)__entry->ip) ); TRACE_EVENT(ext4_begin_ordered_truncate, TP_PROTO(struct inode *inode, loff_t new_size), TP_ARGS(inode, new_size), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, new_size ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->new_size = new_size; ), TP_printk("dev %d,%d ino %lu new_size %lld", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->new_size) ); DECLARE_EVENT_CLASS(ext4__write_begin, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len), TP_ARGS(inode, pos, len), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, pos ) __field( unsigned int, len ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = pos; __entry->len = len; ), TP_printk("dev %d,%d ino %lu pos %lld len %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->pos, __entry->len) ); DEFINE_EVENT(ext4__write_begin, ext4_write_begin, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len), TP_ARGS(inode, pos, len) ); DEFINE_EVENT(ext4__write_begin, ext4_da_write_begin, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len), TP_ARGS(inode, pos, len) ); DECLARE_EVENT_CLASS(ext4__write_end, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, unsigned int copied), TP_ARGS(inode, pos, len, copied), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, pos ) __field( unsigned int, len ) __field( unsigned int, copied ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = pos; __entry->len = len; __entry->copied = copied; ), TP_printk("dev %d,%d ino %lu pos %lld len %u copied %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->pos, __entry->len, __entry->copied) ); DEFINE_EVENT(ext4__write_end, ext4_write_end, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, unsigned int copied), TP_ARGS(inode, pos, len, copied) ); DEFINE_EVENT(ext4__write_end, ext4_journalled_write_end, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, unsigned int copied), TP_ARGS(inode, pos, len, copied) ); DEFINE_EVENT(ext4__write_end, ext4_da_write_end, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, unsigned int copied), 
TP_ARGS(inode, pos, len, copied) ); TRACE_EVENT(ext4_writepages, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( long, nr_to_write ) __field( long, pages_skipped ) __field( loff_t, range_start ) __field( loff_t, range_end ) __field( pgoff_t, writeback_index ) __field( int, sync_mode ) __field( char, for_kupdate ) __field( char, range_cyclic ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; __entry->range_start = wbc->range_start; __entry->range_end = wbc->range_end; __entry->writeback_index = inode->i_mapping->writeback_index; __entry->sync_mode = wbc->sync_mode; __entry->for_kupdate = wbc->for_kupdate; __entry->range_cyclic = wbc->range_cyclic; ), TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld " "range_start %lld range_end %lld sync_mode %d " "for_kupdate %d range_cyclic %d writeback_index %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->nr_to_write, __entry->pages_skipped, __entry->range_start, __entry->range_end, __entry->sync_mode, __entry->for_kupdate, __entry->range_cyclic, (unsigned long) __entry->writeback_index) ); TRACE_EVENT(ext4_da_write_pages, TP_PROTO(struct inode *inode, pgoff_t first_page, struct writeback_control *wbc), TP_ARGS(inode, first_page, wbc), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( pgoff_t, first_page ) __field( long, nr_to_write ) __field( int, sync_mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->first_page = first_page; __entry->nr_to_write = wbc->nr_to_write; __entry->sync_mode = wbc->sync_mode; ), TP_printk("dev %d,%d ino %lu first_page %lu nr_to_write %ld " "sync_mode %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->first_page, __entry->nr_to_write, __entry->sync_mode) ); TRACE_EVENT(ext4_da_write_pages_extent, TP_PROTO(struct inode *inode, struct ext4_map_blocks *map), TP_ARGS(inode, map), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, lblk ) __field( __u32, len ) __field( __u32, flags ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = map->m_lblk; __entry->len = map->m_len; __entry->flags = map->m_flags; ), TP_printk("dev %d,%d ino %lu lblk %llu len %u flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, show_mflags(__entry->flags)) ); TRACE_EVENT(ext4_writepages_result, TP_PROTO(struct inode *inode, struct writeback_control *wbc, int ret, int pages_written), TP_ARGS(inode, wbc, ret, pages_written), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, ret ) __field( int, pages_written ) __field( long, pages_skipped ) __field( pgoff_t, writeback_index ) __field( int, sync_mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->ret = ret; __entry->pages_written = pages_written; __entry->pages_skipped = wbc->pages_skipped; __entry->writeback_index = inode->i_mapping->writeback_index; __entry->sync_mode = wbc->sync_mode; ), TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld " "sync_mode %d writeback_index %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->ret, __entry->pages_written, __entry->pages_skipped, 
__entry->sync_mode, (unsigned long) __entry->writeback_index) ); DECLARE_EVENT_CLASS(ext4__folio_op, TP_PROTO(struct inode *inode, struct folio *folio), TP_ARGS(inode, folio), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( pgoff_t, index ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->index = folio->index; ), TP_printk("dev %d,%d ino %lu folio_index %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned long) __entry->index) ); DEFINE_EVENT(ext4__folio_op, ext4_read_folio, TP_PROTO(struct inode *inode, struct folio *folio), TP_ARGS(inode, folio) ); DEFINE_EVENT(ext4__folio_op, ext4_release_folio, TP_PROTO(struct inode *inode, struct folio *folio), TP_ARGS(inode, folio) ); DECLARE_EVENT_CLASS(ext4_invalidate_folio_op, TP_PROTO(struct folio *folio, size_t offset, size_t length), TP_ARGS(folio, offset, length), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( pgoff_t, index ) __field( size_t, offset ) __field( size_t, length ) ), TP_fast_assign( __entry->dev = folio->mapping->host->i_sb->s_dev; __entry->ino = folio->mapping->host->i_ino; __entry->index = folio->index; __entry->offset = offset; __entry->length = length; ), TP_printk("dev %d,%d ino %lu folio_index %lu offset %zu length %zu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned long) __entry->index, __entry->offset, __entry->length) ); DEFINE_EVENT(ext4_invalidate_folio_op, ext4_invalidate_folio, TP_PROTO(struct folio *folio, size_t offset, size_t length), TP_ARGS(folio, offset, length) ); DEFINE_EVENT(ext4_invalidate_folio_op, ext4_journalled_invalidate_folio, TP_PROTO(struct folio *folio, size_t offset, size_t length), TP_ARGS(folio, offset, length) ); TRACE_EVENT(ext4_discard_blocks, TP_PROTO(struct super_block *sb, unsigned long long blk, unsigned long long count), TP_ARGS(sb, blk, count), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u64, blk ) __field( __u64, count ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->blk = blk; __entry->count = count; ), TP_printk("dev %d,%d blk %llu count %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blk, __entry->count) ); DECLARE_EVENT_CLASS(ext4__mb_new_pa, TP_PROTO(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa), TP_ARGS(ac, pa), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, pa_pstart ) __field( __u64, pa_lstart ) __field( __u32, pa_len ) ), TP_fast_assign( __entry->dev = ac->ac_sb->s_dev; __entry->ino = ac->ac_inode->i_ino; __entry->pa_pstart = pa->pa_pstart; __entry->pa_lstart = pa->pa_lstart; __entry->pa_len = pa->pa_len; ), TP_printk("dev %d,%d ino %lu pstart %llu len %u lstart %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart) ); DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_inode_pa, TP_PROTO(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa), TP_ARGS(ac, pa) ); DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_group_pa, TP_PROTO(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa), TP_ARGS(ac, pa) ); TRACE_EVENT(ext4_mb_release_inode_pa, TP_PROTO(struct ext4_prealloc_space *pa, unsigned long long block, unsigned int count), TP_ARGS(pa, block, count), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, block ) __field( __u32, count ) ), TP_fast_assign( __entry->dev = pa->pa_inode->i_sb->s_dev; __entry->ino = pa->pa_inode->i_ino; 
__entry->block = block; __entry->count = count; ), TP_printk("dev %d,%d ino %lu block %llu count %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->block, __entry->count) ); TRACE_EVENT(ext4_mb_release_group_pa, TP_PROTO(struct super_block *sb, struct ext4_prealloc_space *pa), TP_ARGS(sb, pa), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u64, pa_pstart ) __field( __u32, pa_len ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->pa_pstart = pa->pa_pstart; __entry->pa_len = pa->pa_len; ), TP_printk("dev %d,%d pstart %llu len %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->pa_pstart, __entry->pa_len) ); TRACE_EVENT(ext4_discard_preallocations, TP_PROTO(struct inode *inode, unsigned int len), TP_ARGS(inode, len), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( unsigned int, len ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->len = len; ), TP_printk("dev %d,%d ino %lu len: %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->len) ); TRACE_EVENT(ext4_mb_discard_preallocations, TP_PROTO(struct super_block *sb, int needed), TP_ARGS(sb, needed), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, needed ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->needed = needed; ), TP_printk("dev %d,%d needed %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->needed) ); TRACE_EVENT(ext4_request_blocks, TP_PROTO(struct ext4_allocation_request *ar), TP_ARGS(ar), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( unsigned int, len ) __field( __u32, logical ) __field( __u32, lleft ) __field( __u32, lright ) __field( __u64, goal ) __field( __u64, pleft ) __field( __u64, pright ) __field( unsigned int, flags ) ), TP_fast_assign( __entry->dev = ar->inode->i_sb->s_dev; __entry->ino = ar->inode->i_ino; __entry->len = ar->len; __entry->logical = ar->logical; __entry->goal = ar->goal; __entry->lleft = ar->lleft; __entry->lright = ar->lright; __entry->pleft = ar->pleft; __entry->pright = ar->pright; __entry->flags = ar->flags; ), TP_printk("dev %d,%d ino %lu flags %s len %u lblk %u goal %llu " "lleft %u lright %u pleft %llu pright %llu ", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, show_mballoc_flags(__entry->flags), __entry->len, __entry->logical, __entry->goal, __entry->lleft, __entry->lright, __entry->pleft, __entry->pright) ); TRACE_EVENT(ext4_allocate_blocks, TP_PROTO(struct ext4_allocation_request *ar, unsigned long long block), TP_ARGS(ar, block), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, block ) __field( unsigned int, len ) __field( __u32, logical ) __field( __u32, lleft ) __field( __u32, lright ) __field( __u64, goal ) __field( __u64, pleft ) __field( __u64, pright ) __field( unsigned int, flags ) ), TP_fast_assign( __entry->dev = ar->inode->i_sb->s_dev; __entry->ino = ar->inode->i_ino; __entry->block = block; __entry->len = ar->len; __entry->logical = ar->logical; __entry->goal = ar->goal; __entry->lleft = ar->lleft; __entry->lright = ar->lright; __entry->pleft = ar->pleft; __entry->pright = ar->pright; __entry->flags = ar->flags; ), TP_printk("dev %d,%d ino %lu flags %s len %u block %llu lblk %u " "goal %llu lleft %u lright %u pleft %llu pright %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, show_mballoc_flags(__entry->flags), __entry->len, __entry->block, __entry->logical, __entry->goal, __entry->lleft, __entry->lright, 
__entry->pleft, __entry->pright) ); TRACE_EVENT(ext4_free_blocks, TP_PROTO(struct inode *inode, __u64 block, unsigned long count, int flags), TP_ARGS(inode, block, count, flags), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, block ) __field( unsigned long, count ) __field( int, flags ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->block = block; __entry->count = count; __entry->flags = flags; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->block, __entry->count, show_free_flags(__entry->flags)) ); TRACE_EVENT(ext4_sync_file_enter, TP_PROTO(struct file *file, int datasync), TP_ARGS(file, datasync), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ino_t, parent ) __field( int, datasync ) ), TP_fast_assign( struct dentry *dentry = file->f_path.dentry; __entry->dev = dentry->d_sb->s_dev; __entry->ino = d_inode(dentry)->i_ino; __entry->datasync = datasync; __entry->parent = d_inode(dentry->d_parent)->i_ino; ), TP_printk("dev %d,%d ino %lu parent %lu datasync %d ", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned long) __entry->parent, __entry->datasync) ); TRACE_EVENT(ext4_sync_file_exit, TP_PROTO(struct inode *inode, int ret), TP_ARGS(inode, ret), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, ret ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->ret = ret; ), TP_printk("dev %d,%d ino %lu ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->ret) ); TRACE_EVENT(ext4_sync_fs, TP_PROTO(struct super_block *sb, int wait), TP_ARGS(sb, wait), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, wait ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->wait = wait; ), TP_printk("dev %d,%d wait %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->wait) ); TRACE_EVENT(ext4_alloc_da_blocks, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( unsigned int, data_blocks ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->data_blocks = EXT4_I(inode)->i_reserved_data_blocks; ), TP_printk("dev %d,%d ino %lu reserved_data_blocks %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->data_blocks) ); TRACE_EVENT(ext4_mballoc_alloc, TP_PROTO(struct ext4_allocation_context *ac), TP_ARGS(ac), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u32, orig_logical ) __field( int, orig_start ) __field( __u32, orig_group ) __field( int, orig_len ) __field( __u32, goal_logical ) __field( int, goal_start ) __field( __u32, goal_group ) __field( int, goal_len ) __field( __u32, result_logical ) __field( int, result_start ) __field( __u32, result_group ) __field( int, result_len ) __field( __u16, found ) __field( __u16, groups ) __field( __u16, buddy ) __field( __u16, flags ) __field( __u16, tail ) __field( __u8, cr ) ), TP_fast_assign( __entry->dev = ac->ac_inode->i_sb->s_dev; __entry->ino = ac->ac_inode->i_ino; __entry->orig_logical = ac->ac_o_ex.fe_logical; __entry->orig_start = ac->ac_o_ex.fe_start; __entry->orig_group = ac->ac_o_ex.fe_group; __entry->orig_len = ac->ac_o_ex.fe_len; __entry->goal_logical = ac->ac_g_ex.fe_logical; __entry->goal_start 
= ac->ac_g_ex.fe_start; __entry->goal_group = ac->ac_g_ex.fe_group; __entry->goal_len = ac->ac_g_ex.fe_len; __entry->result_logical = ac->ac_f_ex.fe_logical; __entry->result_start = ac->ac_f_ex.fe_start; __entry->result_group = ac->ac_f_ex.fe_group; __entry->result_len = ac->ac_f_ex.fe_len; __entry->found = ac->ac_found; __entry->flags = ac->ac_flags; __entry->groups = ac->ac_groups_scanned; __entry->buddy = ac->ac_buddy; __entry->tail = ac->ac_tail; __entry->cr = ac->ac_criteria; ), TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u " "result %u/%d/%u@%u blks %u grps %u cr %s flags %s " "tail %u broken %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->orig_group, __entry->orig_start, __entry->orig_len, __entry->orig_logical, __entry->goal_group, __entry->goal_start, __entry->goal_len, __entry->goal_logical, __entry->result_group, __entry->result_start, __entry->result_len, __entry->result_logical, __entry->found, __entry->groups, show_criteria(__entry->cr), show_mballoc_flags(__entry->flags), __entry->tail, __entry->buddy ? 1 << __entry->buddy : 0) ); TRACE_EVENT(ext4_mballoc_prealloc, TP_PROTO(struct ext4_allocation_context *ac), TP_ARGS(ac), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u32, orig_logical ) __field( int, orig_start ) __field( __u32, orig_group ) __field( int, orig_len ) __field( __u32, result_logical ) __field( int, result_start ) __field( __u32, result_group ) __field( int, result_len ) ), TP_fast_assign( __entry->dev = ac->ac_inode->i_sb->s_dev; __entry->ino = ac->ac_inode->i_ino; __entry->orig_logical = ac->ac_o_ex.fe_logical; __entry->orig_start = ac->ac_o_ex.fe_start; __entry->orig_group = ac->ac_o_ex.fe_group; __entry->orig_len = ac->ac_o_ex.fe_len; __entry->result_logical = ac->ac_b_ex.fe_logical; __entry->result_start = ac->ac_b_ex.fe_start; __entry->result_group = ac->ac_b_ex.fe_group; __entry->result_len = ac->ac_b_ex.fe_len; ), TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->orig_group, __entry->orig_start, __entry->orig_len, __entry->orig_logical, __entry->result_group, __entry->result_start, __entry->result_len, __entry->result_logical) ); DECLARE_EVENT_CLASS(ext4__mballoc, TP_PROTO(struct super_block *sb, struct inode *inode, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, inode, group, start, len), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, result_start ) __field( __u32, result_group ) __field( int, result_len ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->ino = inode ? 
inode->i_ino : 0; __entry->result_start = start; __entry->result_group = group; __entry->result_len = len; ), TP_printk("dev %d,%d inode %lu extent %u/%d/%d ", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->result_group, __entry->result_start, __entry->result_len) ); DEFINE_EVENT(ext4__mballoc, ext4_mballoc_discard, TP_PROTO(struct super_block *sb, struct inode *inode, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, inode, group, start, len) ); DEFINE_EVENT(ext4__mballoc, ext4_mballoc_free, TP_PROTO(struct super_block *sb, struct inode *inode, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, inode, group, start, len) ); TRACE_EVENT(ext4_forget, TP_PROTO(struct inode *inode, int is_metadata, __u64 block), TP_ARGS(inode, is_metadata, block), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, block ) __field( int, is_metadata ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->block = block; __entry->is_metadata = is_metadata; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->is_metadata, __entry->block) ); TRACE_EVENT(ext4_da_update_reserve_space, TP_PROTO(struct inode *inode, int used_blocks, int quota_claim), TP_ARGS(inode, used_blocks, quota_claim), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, i_blocks ) __field( int, used_blocks ) __field( int, reserved_data_blocks ) __field( int, quota_claim ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->i_blocks = inode->i_blocks; __entry->used_blocks = used_blocks; __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; __entry->quota_claim = quota_claim; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d " "reserved_data_blocks %d quota_claim %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->i_blocks, __entry->used_blocks, __entry->reserved_data_blocks, __entry->quota_claim) ); TRACE_EVENT(ext4_da_reserve_space, TP_PROTO(struct inode *inode, int nr_resv), TP_ARGS(inode, nr_resv), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, i_blocks ) __field( int, reserve_blocks ) __field( int, reserved_data_blocks ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->i_blocks = inode->i_blocks; __entry->reserve_blocks = nr_resv; __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu reserve_blocks %d" "reserved_data_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->i_blocks, __entry->reserve_blocks, __entry->reserved_data_blocks) ); TRACE_EVENT(ext4_da_release_space, TP_PROTO(struct inode *inode, int freed_blocks), TP_ARGS(inode, freed_blocks), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, i_blocks ) __field( int, freed_blocks ) __field( int, reserved_data_blocks ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->i_blocks = inode->i_blocks; __entry->freed_blocks = freed_blocks; 
__entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu freed_blocks %d " "reserved_data_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->i_blocks, __entry->freed_blocks, __entry->reserved_data_blocks) ); DECLARE_EVENT_CLASS(ext4__bitmap_load, TP_PROTO(struct super_block *sb, unsigned long group), TP_ARGS(sb, group), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u32, group ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->group = group; ), TP_printk("dev %d,%d group %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group) ); DEFINE_EVENT(ext4__bitmap_load, ext4_mb_bitmap_load, TP_PROTO(struct super_block *sb, unsigned long group), TP_ARGS(sb, group) ); DEFINE_EVENT(ext4__bitmap_load, ext4_mb_buddy_bitmap_load, TP_PROTO(struct super_block *sb, unsigned long group), TP_ARGS(sb, group) ); DEFINE_EVENT(ext4__bitmap_load, ext4_load_inode_bitmap, TP_PROTO(struct super_block *sb, unsigned long group), TP_ARGS(sb, group) ); TRACE_EVENT(ext4_read_block_bitmap_load, TP_PROTO(struct super_block *sb, unsigned long group, bool prefetch), TP_ARGS(sb, group, prefetch), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u32, group ) __field( bool, prefetch ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->group = group; __entry->prefetch = prefetch; ), TP_printk("dev %d,%d group %u prefetch %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group, __entry->prefetch) ); DECLARE_EVENT_CLASS(ext4__fallocate_mode, TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), TP_ARGS(inode, offset, len, mode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, offset ) __field( loff_t, len ) __field( int, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->offset = offset; __entry->len = len; __entry->mode = mode; ), TP_printk("dev %d,%d ino %lu offset %lld len %lld mode %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->offset, __entry->len, show_falloc_mode(__entry->mode)) ); DEFINE_EVENT(ext4__fallocate_mode, ext4_fallocate_enter, TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), TP_ARGS(inode, offset, len, mode) ); DEFINE_EVENT(ext4__fallocate_mode, ext4_punch_hole, TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), TP_ARGS(inode, offset, len, mode) ); DEFINE_EVENT(ext4__fallocate_mode, ext4_zero_range, TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), TP_ARGS(inode, offset, len, mode) ); TRACE_EVENT(ext4_fallocate_exit, TP_PROTO(struct inode *inode, loff_t offset, unsigned int max_blocks, int ret), TP_ARGS(inode, offset, max_blocks, ret), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, pos ) __field( unsigned int, blocks ) __field( int, ret ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = offset; __entry->blocks = max_blocks; __entry->ret = ret; ), TP_printk("dev %d,%d ino %lu pos %lld blocks %u ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->pos, __entry->blocks, __entry->ret) ); TRACE_EVENT(ext4_unlink_enter, TP_PROTO(struct inode *parent, struct dentry *dentry), TP_ARGS(parent, dentry), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ino_t, parent ) __field( loff_t, size ) ), TP_fast_assign( __entry->dev = 
dentry->d_sb->s_dev; __entry->ino = d_inode(dentry)->i_ino; __entry->parent = parent->i_ino; __entry->size = d_inode(dentry)->i_size; ), TP_printk("dev %d,%d ino %lu size %lld parent %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->size, (unsigned long) __entry->parent) ); TRACE_EVENT(ext4_unlink_exit, TP_PROTO(struct dentry *dentry, int ret), TP_ARGS(dentry, ret), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, ret ) ), TP_fast_assign( __entry->dev = dentry->d_sb->s_dev; __entry->ino = d_inode(dentry)->i_ino; __entry->ret = ret; ), TP_printk("dev %d,%d ino %lu ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->ret) ); DECLARE_EVENT_CLASS(ext4__truncate, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, blocks ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->blocks = inode->i_blocks; ), TP_printk("dev %d,%d ino %lu blocks %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->blocks) ); DEFINE_EVENT(ext4__truncate, ext4_truncate_enter, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(ext4__truncate, ext4_truncate_exit, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); /* 'ux' is the unwritten extent. */ TRACE_EVENT(ext4_ext_convert_to_initialized_enter, TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, struct ext4_extent *ux), TP_ARGS(inode, map, ux), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, m_lblk ) __field( unsigned, m_len ) __field( ext4_lblk_t, u_lblk ) __field( unsigned, u_len ) __field( ext4_fsblk_t, u_pblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->m_lblk = map->m_lblk; __entry->m_len = map->m_len; __entry->u_lblk = le32_to_cpu(ux->ee_block); __entry->u_len = ext4_ext_get_actual_len(ux); __entry->u_pblk = ext4_ext_pblock(ux); ), TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u u_lblk %u u_len %u " "u_pblk %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->m_lblk, __entry->m_len, __entry->u_lblk, __entry->u_len, __entry->u_pblk) ); /* * 'ux' is the unwritten extent. * 'ix' is the initialized extent to which blocks are transferred. 
*/ TRACE_EVENT(ext4_ext_convert_to_initialized_fastpath, TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, struct ext4_extent *ux, struct ext4_extent *ix), TP_ARGS(inode, map, ux, ix), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, m_lblk ) __field( unsigned, m_len ) __field( ext4_lblk_t, u_lblk ) __field( unsigned, u_len ) __field( ext4_fsblk_t, u_pblk ) __field( ext4_lblk_t, i_lblk ) __field( unsigned, i_len ) __field( ext4_fsblk_t, i_pblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->m_lblk = map->m_lblk; __entry->m_len = map->m_len; __entry->u_lblk = le32_to_cpu(ux->ee_block); __entry->u_len = ext4_ext_get_actual_len(ux); __entry->u_pblk = ext4_ext_pblock(ux); __entry->i_lblk = le32_to_cpu(ix->ee_block); __entry->i_len = ext4_ext_get_actual_len(ix); __entry->i_pblk = ext4_ext_pblock(ix); ), TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u " "u_lblk %u u_len %u u_pblk %llu " "i_lblk %u i_len %u i_pblk %llu ", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->m_lblk, __entry->m_len, __entry->u_lblk, __entry->u_len, __entry->u_pblk, __entry->i_lblk, __entry->i_len, __entry->i_pblk) ); DECLARE_EVENT_CLASS(ext4__map_blocks_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len, unsigned int flags), TP_ARGS(inode, lblk, len, flags), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( unsigned int, len ) __field( unsigned int, flags ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = lblk; __entry->len = len; __entry->flags = flags; ), TP_printk("dev %d,%d ino %lu lblk %u len %u flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, show_map_flags(__entry->flags)) ); DEFINE_EVENT(ext4__map_blocks_enter, ext4_ext_map_blocks_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned len, unsigned flags), TP_ARGS(inode, lblk, len, flags) ); DEFINE_EVENT(ext4__map_blocks_enter, ext4_ind_map_blocks_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned len, unsigned flags), TP_ARGS(inode, lblk, len, flags) ); DECLARE_EVENT_CLASS(ext4__map_blocks_exit, TP_PROTO(struct inode *inode, unsigned flags, struct ext4_map_blocks *map, int ret), TP_ARGS(inode, flags, map, ret), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( unsigned int, flags ) __field( ext4_fsblk_t, pblk ) __field( ext4_lblk_t, lblk ) __field( unsigned int, len ) __field( unsigned int, mflags ) __field( int, ret ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->flags = flags; __entry->pblk = map->m_pblk; __entry->lblk = map->m_lblk; __entry->len = map->m_len; __entry->mflags = map->m_flags; __entry->ret = ret; ), TP_printk("dev %d,%d ino %lu flags %s lblk %u pblk %llu len %u " "mflags %s ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, show_map_flags(__entry->flags), __entry->lblk, __entry->pblk, __entry->len, show_mflags(__entry->mflags), __entry->ret) ); DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit, TP_PROTO(struct inode *inode, unsigned flags, struct ext4_map_blocks *map, int ret), TP_ARGS(inode, flags, map, ret) ); DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit, TP_PROTO(struct inode *inode, unsigned flags, struct ext4_map_blocks *map, int ret), TP_ARGS(inode, flags, map, ret) ); 
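/*
 * Illustrative sketch, not part of the upstream header: the event classes
 * ext4__map_blocks_enter/exit above exist so the extent-mapped and
 * indirect-mapped paths can share one entry/exit layout via DEFINE_EVENT().
 * The pairing below is a simplified assumption of how ext4_ext_map_blocks()
 * brackets its work with the two hooks; the argument lists follow the
 * TP_PROTO signatures above rather than the exact code in fs/ext4/extents.c.
 */
#if 0	/* example only, never compiled */
int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
			struct ext4_map_blocks *map, int flags)
{
	int ret;

	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
	ret = 0;	/* ... extent lookup and allocation ... */
	trace_ext4_ext_map_blocks_exit(inode, flags, map, ret);
	return ret;
}
#endif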
TRACE_EVENT(ext4_ext_load_extent, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk), TP_ARGS(inode, lblk, pblk), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_fsblk_t, pblk ) __field( ext4_lblk_t, lblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pblk = pblk; __entry->lblk = lblk; ), TP_printk("dev %d,%d ino %lu lblk %u pblk %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->pblk) ); TRACE_EVENT(ext4_load_inode, TP_PROTO(struct super_block *sb, unsigned long ino), TP_ARGS(sb, ino), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->ino = ino; ), TP_printk("dev %d,%d ino %ld", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino) ); TRACE_EVENT(ext4_journal_start_sb, TP_PROTO(struct super_block *sb, int blocks, int rsv_blocks, int revoke_creds, int type, unsigned long IP), TP_ARGS(sb, blocks, rsv_blocks, revoke_creds, type, IP), TP_STRUCT__entry( __field( dev_t, dev ) __field( unsigned long, ip ) __field( int, blocks ) __field( int, rsv_blocks ) __field( int, revoke_creds ) __field( int, type ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->ip = IP; __entry->blocks = blocks; __entry->rsv_blocks = rsv_blocks; __entry->revoke_creds = revoke_creds; __entry->type = type; ), TP_printk("dev %d,%d blocks %d, rsv_blocks %d, revoke_creds %d," " type %d, caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blocks, __entry->rsv_blocks, __entry->revoke_creds, __entry->type, (void *)__entry->ip) ); TRACE_EVENT(ext4_journal_start_inode, TP_PROTO(struct inode *inode, int blocks, int rsv_blocks, int revoke_creds, int type, unsigned long IP), TP_ARGS(inode, blocks, rsv_blocks, revoke_creds, type, IP), TP_STRUCT__entry( __field( unsigned long, ino ) __field( dev_t, dev ) __field( unsigned long, ip ) __field( int, blocks ) __field( int, rsv_blocks ) __field( int, revoke_creds ) __field( int, type ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ip = IP; __entry->blocks = blocks; __entry->rsv_blocks = rsv_blocks; __entry->revoke_creds = revoke_creds; __entry->type = type; __entry->ino = inode->i_ino; ), TP_printk("dev %d,%d blocks %d, rsv_blocks %d, revoke_creds %d," " type %d, ino %lu, caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blocks, __entry->rsv_blocks, __entry->revoke_creds, __entry->type, __entry->ino, (void *)__entry->ip) ); TRACE_EVENT(ext4_journal_start_reserved, TP_PROTO(struct super_block *sb, int blocks, unsigned long IP), TP_ARGS(sb, blocks, IP), TP_STRUCT__entry( __field( dev_t, dev ) __field(unsigned long, ip ) __field( int, blocks ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->ip = IP; __entry->blocks = blocks; ), TP_printk("dev %d,%d blocks, %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blocks, (void *)__entry->ip) ); DECLARE_EVENT_CLASS(ext4__trim, TP_PROTO(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, group, start, len), TP_STRUCT__entry( __field( int, dev_major ) __field( int, dev_minor ) __field( __u32, group ) __field( int, start ) __field( int, len ) ), TP_fast_assign( __entry->dev_major = MAJOR(sb->s_dev); __entry->dev_minor = MINOR(sb->s_dev); __entry->group = group; __entry->start = start; __entry->len = len; ), TP_printk("dev %d,%d group %u, start %d, len %d", __entry->dev_major, __entry->dev_minor, __entry->group, 
__entry->start, __entry->len) ); DEFINE_EVENT(ext4__trim, ext4_trim_extent, TP_PROTO(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, group, start, len) ); DEFINE_EVENT(ext4__trim, ext4_trim_all_free, TP_PROTO(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, group, start, len) ); TRACE_EVENT(ext4_ext_handle_unwritten_extents, TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int flags, unsigned int allocated, ext4_fsblk_t newblock), TP_ARGS(inode, map, flags, allocated, newblock), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, flags ) __field( ext4_lblk_t, lblk ) __field( ext4_fsblk_t, pblk ) __field( unsigned int, len ) __field( unsigned int, allocated ) __field( ext4_fsblk_t, newblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->flags = flags; __entry->lblk = map->m_lblk; __entry->pblk = map->m_pblk; __entry->len = map->m_len; __entry->allocated = allocated; __entry->newblk = newblock; ), TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %s " "allocated %d newblock %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->lblk, (unsigned long long) __entry->pblk, __entry->len, show_map_flags(__entry->flags), (unsigned int) __entry->allocated, (unsigned long long) __entry->newblk) ); TRACE_EVENT(ext4_get_implied_cluster_alloc_exit, TP_PROTO(struct super_block *sb, struct ext4_map_blocks *map, int ret), TP_ARGS(sb, map, ret), TP_STRUCT__entry( __field( dev_t, dev ) __field( unsigned int, flags ) __field( ext4_lblk_t, lblk ) __field( ext4_fsblk_t, pblk ) __field( unsigned int, len ) __field( int, ret ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->flags = map->m_flags; __entry->lblk = map->m_lblk; __entry->pblk = map->m_pblk; __entry->len = map->m_len; __entry->ret = ret; ), TP_printk("dev %d,%d m_lblk %u m_pblk %llu m_len %u m_flags %s ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->lblk, (unsigned long long) __entry->pblk, __entry->len, show_mflags(__entry->flags), __entry->ret) ); TRACE_EVENT(ext4_ext_show_extent, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, unsigned short len), TP_ARGS(inode, lblk, pblk, len), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_fsblk_t, pblk ) __field( ext4_lblk_t, lblk ) __field( unsigned short, len ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pblk = pblk; __entry->lblk = lblk; __entry->len = len; ), TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->lblk, (unsigned long long) __entry->pblk, (unsigned short) __entry->len) ); TRACE_EVENT(ext4_remove_blocks, TP_PROTO(struct inode *inode, struct ext4_extent *ex, ext4_lblk_t from, ext4_fsblk_t to, struct partial_cluster *pc), TP_ARGS(inode, ex, from, to, pc), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, from ) __field( ext4_lblk_t, to ) __field( ext4_fsblk_t, ee_pblk ) __field( ext4_lblk_t, ee_lblk ) __field( unsigned short, ee_len ) __field( ext4_fsblk_t, pc_pclu ) __field( ext4_lblk_t, pc_lblk ) __field( int, pc_state) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->from = from; __entry->to = to; __entry->ee_pblk = ext4_ext_pblock(ex); __entry->ee_lblk = le32_to_cpu(ex->ee_block); 
__entry->ee_len = ext4_ext_get_actual_len(ex); __entry->pc_pclu = pc->pclu; __entry->pc_lblk = pc->lblk; __entry->pc_state = pc->state; ), TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]" "from %u to %u partial [pclu %lld lblk %u state %d]", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->ee_lblk, (unsigned long long) __entry->ee_pblk, (unsigned short) __entry->ee_len, (unsigned) __entry->from, (unsigned) __entry->to, (long long) __entry->pc_pclu, (unsigned int) __entry->pc_lblk, (int) __entry->pc_state) ); TRACE_EVENT(ext4_ext_rm_leaf, TP_PROTO(struct inode *inode, ext4_lblk_t start, struct ext4_extent *ex, struct partial_cluster *pc), TP_ARGS(inode, start, ex, pc), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, start ) __field( ext4_lblk_t, ee_lblk ) __field( ext4_fsblk_t, ee_pblk ) __field( short, ee_len ) __field( ext4_fsblk_t, pc_pclu ) __field( ext4_lblk_t, pc_lblk ) __field( int, pc_state) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->start = start; __entry->ee_lblk = le32_to_cpu(ex->ee_block); __entry->ee_pblk = ext4_ext_pblock(ex); __entry->ee_len = ext4_ext_get_actual_len(ex); __entry->pc_pclu = pc->pclu; __entry->pc_lblk = pc->lblk; __entry->pc_state = pc->state; ), TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]" "partial [pclu %lld lblk %u state %d]", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->start, (unsigned) __entry->ee_lblk, (unsigned long long) __entry->ee_pblk, (unsigned short) __entry->ee_len, (long long) __entry->pc_pclu, (unsigned int) __entry->pc_lblk, (int) __entry->pc_state) ); TRACE_EVENT(ext4_ext_rm_idx, TP_PROTO(struct inode *inode, ext4_fsblk_t pblk), TP_ARGS(inode, pblk), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_fsblk_t, pblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pblk = pblk; ), TP_printk("dev %d,%d ino %lu index_pblk %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned long long) __entry->pblk) ); TRACE_EVENT(ext4_ext_remove_space, TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end, int depth), TP_ARGS(inode, start, end, depth), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, start ) __field( ext4_lblk_t, end ) __field( int, depth ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->start = start; __entry->end = end; __entry->depth = depth; ), TP_printk("dev %d,%d ino %lu since %u end %u depth %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->start, (unsigned) __entry->end, __entry->depth) ); TRACE_EVENT(ext4_ext_remove_space_done, TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end, int depth, struct partial_cluster *pc, __le16 eh_entries), TP_ARGS(inode, start, end, depth, pc, eh_entries), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, start ) __field( ext4_lblk_t, end ) __field( int, depth ) __field( ext4_fsblk_t, pc_pclu ) __field( ext4_lblk_t, pc_lblk ) __field( int, pc_state ) __field( unsigned short, eh_entries ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->start = start; __entry->end = end; __entry->depth = depth; __entry->pc_pclu = pc->pclu; __entry->pc_lblk = pc->lblk; __entry->pc_state = 
pc->state; __entry->eh_entries = le16_to_cpu(eh_entries); ), TP_printk("dev %d,%d ino %lu since %u end %u depth %d " "partial [pclu %lld lblk %u state %d] " "remaining_entries %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->start, (unsigned) __entry->end, __entry->depth, (long long) __entry->pc_pclu, (unsigned int) __entry->pc_lblk, (int) __entry->pc_state, (unsigned short) __entry->eh_entries) ); DECLARE_EVENT_CLASS(ext4__es_extent, TP_PROTO(struct inode *inode, struct extent_status *es), TP_ARGS(inode, es), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) __field( ext4_fsblk_t, pblk ) __field( char, status ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = es->es_lblk; __entry->len = es->es_len; __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); ), TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, __entry->pblk, show_extent_status(__entry->status)) ); DEFINE_EVENT(ext4__es_extent, ext4_es_insert_extent, TP_PROTO(struct inode *inode, struct extent_status *es), TP_ARGS(inode, es) ); DEFINE_EVENT(ext4__es_extent, ext4_es_cache_extent, TP_PROTO(struct inode *inode, struct extent_status *es), TP_ARGS(inode, es) ); TRACE_EVENT(ext4_es_remove_extent, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len), TP_ARGS(inode, lblk, len), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, lblk ) __field( loff_t, len ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = lblk; __entry->len = len; ), TP_printk("dev %d,%d ino %lu es [%lld/%lld)", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len) ); TRACE_EVENT(ext4_es_find_extent_range_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk), TP_ARGS(inode, lblk), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = lblk; ), TP_printk("dev %d,%d ino %lu lblk %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk) ); TRACE_EVENT(ext4_es_find_extent_range_exit, TP_PROTO(struct inode *inode, struct extent_status *es), TP_ARGS(inode, es), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) __field( ext4_fsblk_t, pblk ) __field( char, status ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = es->es_lblk; __entry->len = es->es_len; __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); ), TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, __entry->pblk, show_extent_status(__entry->status)) ); TRACE_EVENT(ext4_es_lookup_extent_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk), TP_ARGS(inode, lblk), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = lblk; ), TP_printk("dev %d,%d ino %lu lblk %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) 
__entry->ino, __entry->lblk) ); TRACE_EVENT(ext4_es_lookup_extent_exit, TP_PROTO(struct inode *inode, struct extent_status *es, int found), TP_ARGS(inode, es, found), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) __field( ext4_fsblk_t, pblk ) __field( char, status ) __field( int, found ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = es->es_lblk; __entry->len = es->es_len; __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); __entry->found = found; ), TP_printk("dev %d,%d ino %lu found %d [%u/%u) %llu %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->found, __entry->lblk, __entry->len, __entry->found ? __entry->pblk : 0, show_extent_status(__entry->found ? __entry->status : 0)) ); DECLARE_EVENT_CLASS(ext4__es_shrink_enter, TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt), TP_ARGS(sb, nr_to_scan, cache_cnt), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, nr_to_scan ) __field( int, cache_cnt ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->nr_to_scan = nr_to_scan; __entry->cache_cnt = cache_cnt; ), TP_printk("dev %d,%d nr_to_scan %d cache_cnt %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_to_scan, __entry->cache_cnt) ); DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_count, TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt), TP_ARGS(sb, nr_to_scan, cache_cnt) ); DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_scan_enter, TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt), TP_ARGS(sb, nr_to_scan, cache_cnt) ); TRACE_EVENT(ext4_es_shrink_scan_exit, TP_PROTO(struct super_block *sb, int nr_shrunk, int cache_cnt), TP_ARGS(sb, nr_shrunk, cache_cnt), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, nr_shrunk ) __field( int, cache_cnt ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->nr_shrunk = nr_shrunk; __entry->cache_cnt = cache_cnt; ), TP_printk("dev %d,%d nr_shrunk %d cache_cnt %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_shrunk, __entry->cache_cnt) ); TRACE_EVENT(ext4_collapse_range, TP_PROTO(struct inode *inode, loff_t offset, loff_t len), TP_ARGS(inode, offset, len), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, offset) __field(loff_t, len) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->offset = offset; __entry->len = len; ), TP_printk("dev %d,%d ino %lu offset %lld len %lld", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->offset, __entry->len) ); TRACE_EVENT(ext4_insert_range, TP_PROTO(struct inode *inode, loff_t offset, loff_t len), TP_ARGS(inode, offset, len), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, offset) __field(loff_t, len) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->offset = offset; __entry->len = len; ), TP_printk("dev %d,%d ino %lu offset %lld len %lld", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->offset, __entry->len) ); TRACE_EVENT(ext4_es_shrink, TP_PROTO(struct super_block *sb, int nr_shrunk, u64 scan_time, int nr_skipped, int retried), TP_ARGS(sb, nr_shrunk, scan_time, nr_skipped, retried), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, nr_shrunk ) __field( unsigned long long, scan_time ) __field( int, nr_skipped ) __field( int, retried ) ), TP_fast_assign( 
__entry->dev = sb->s_dev; __entry->nr_shrunk = nr_shrunk; __entry->scan_time = div_u64(scan_time, 1000); __entry->nr_skipped = nr_skipped; __entry->retried = retried; ), TP_printk("dev %d,%d nr_shrunk %d, scan_time %llu " "nr_skipped %d retried %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_shrunk, __entry->scan_time, __entry->nr_skipped, __entry->retried) ); TRACE_EVENT(ext4_es_insert_delayed_extent, TP_PROTO(struct inode *inode, struct extent_status *es, bool lclu_allocated, bool end_allocated), TP_ARGS(inode, es, lclu_allocated, end_allocated), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) __field( ext4_fsblk_t, pblk ) __field( char, status ) __field( bool, lclu_allocated ) __field( bool, end_allocated ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = es->es_lblk; __entry->len = es->es_len; __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); __entry->lclu_allocated = lclu_allocated; __entry->end_allocated = end_allocated; ), TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s " "allocated %d %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, __entry->pblk, show_extent_status(__entry->status), __entry->lclu_allocated, __entry->end_allocated) ); /* fsmap traces */ DECLARE_EVENT_CLASS(ext4_fsmap_class, TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, u64 owner), TP_ARGS(sb, keydev, agno, bno, len, owner), TP_STRUCT__entry( __field(dev_t, dev) __field(dev_t, keydev) __field(u32, agno) __field(u64, bno) __field(u64, len) __field(u64, owner) ), TP_fast_assign( __entry->dev = sb->s_bdev->bd_dev; __entry->keydev = new_decode_dev(keydev); __entry->agno = agno; __entry->bno = bno; __entry->len = len; __entry->owner = owner; ), TP_printk("dev %d:%d keydev %d:%d agno %u bno %llu len %llu owner %lld\n", MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->keydev), MINOR(__entry->keydev), __entry->agno, __entry->bno, __entry->len, __entry->owner) ) #define DEFINE_FSMAP_EVENT(name) \ DEFINE_EVENT(ext4_fsmap_class, name, \ TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, \ u64 owner), \ TP_ARGS(sb, keydev, agno, bno, len, owner)) DEFINE_FSMAP_EVENT(ext4_fsmap_low_key); DEFINE_FSMAP_EVENT(ext4_fsmap_high_key); DEFINE_FSMAP_EVENT(ext4_fsmap_mapping); DECLARE_EVENT_CLASS(ext4_getfsmap_class, TP_PROTO(struct super_block *sb, struct ext4_fsmap *fsmap), TP_ARGS(sb, fsmap), TP_STRUCT__entry( __field(dev_t, dev) __field(dev_t, keydev) __field(u64, block) __field(u64, len) __field(u64, owner) __field(u64, flags) ), TP_fast_assign( __entry->dev = sb->s_bdev->bd_dev; __entry->keydev = new_decode_dev(fsmap->fmr_device); __entry->block = fsmap->fmr_physical; __entry->len = fsmap->fmr_length; __entry->owner = fsmap->fmr_owner; __entry->flags = fsmap->fmr_flags; ), TP_printk("dev %d:%d keydev %d:%d block %llu len %llu owner %lld flags 0x%llx\n", MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->keydev), MINOR(__entry->keydev), __entry->block, __entry->len, __entry->owner, __entry->flags) ) #define DEFINE_GETFSMAP_EVENT(name) \ DEFINE_EVENT(ext4_getfsmap_class, name, \ TP_PROTO(struct super_block *sb, struct ext4_fsmap *fsmap), \ TP_ARGS(sb, fsmap)) DEFINE_GETFSMAP_EVENT(ext4_getfsmap_low_key); DEFINE_GETFSMAP_EVENT(ext4_getfsmap_high_key); DEFINE_GETFSMAP_EVENT(ext4_getfsmap_mapping); TRACE_EVENT(ext4_shutdown, TP_PROTO(struct 
super_block *sb, unsigned long flags), TP_ARGS(sb, flags), TP_STRUCT__entry( __field( dev_t, dev ) __field( unsigned, flags ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->flags = flags; ), TP_printk("dev %d,%d flags %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->flags) ); TRACE_EVENT(ext4_error, TP_PROTO(struct super_block *sb, const char *function, unsigned int line), TP_ARGS(sb, function, line), TP_STRUCT__entry( __field( dev_t, dev ) __field( const char *, function ) __field( unsigned, line ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->function = function; __entry->line = line; ), TP_printk("dev %d,%d function %s line %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->function, __entry->line) ); TRACE_EVENT(ext4_prefetch_bitmaps, TP_PROTO(struct super_block *sb, ext4_group_t group, ext4_group_t next, unsigned int prefetch_ios), TP_ARGS(sb, group, next, prefetch_ios), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u32, group ) __field( __u32, next ) __field( __u32, ios ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->group = group; __entry->next = next; __entry->ios = prefetch_ios; ), TP_printk("dev %d,%d group %u next %u ios %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group, __entry->next, __entry->ios) ); TRACE_EVENT(ext4_lazy_itable_init, TP_PROTO(struct super_block *sb, ext4_group_t group), TP_ARGS(sb, group), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u32, group ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->group = group; ), TP_printk("dev %d,%d group %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group) ); TRACE_EVENT(ext4_fc_replay_scan, TP_PROTO(struct super_block *sb, int error, int off), TP_ARGS(sb, error, off), TP_STRUCT__entry( __field(dev_t, dev) __field(int, error) __field(int, off) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->error = error; __entry->off = off; ), TP_printk("dev %d,%d error %d, off %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->error, __entry->off) ); TRACE_EVENT(ext4_fc_replay, TP_PROTO(struct super_block *sb, int tag, int ino, int priv1, int priv2), TP_ARGS(sb, tag, ino, priv1, priv2), TP_STRUCT__entry( __field(dev_t, dev) __field(int, tag) __field(int, ino) __field(int, priv1) __field(int, priv2) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->tag = tag; __entry->ino = ino; __entry->priv1 = priv1; __entry->priv2 = priv2; ), TP_printk("dev %d,%d: tag %d, ino %d, data1 %d, data2 %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tag, __entry->ino, __entry->priv1, __entry->priv2) ); TRACE_EVENT(ext4_fc_commit_start, TP_PROTO(struct super_block *sb, tid_t commit_tid), TP_ARGS(sb, commit_tid), TP_STRUCT__entry( __field(dev_t, dev) __field(tid_t, tid) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->tid = commit_tid; ), TP_printk("dev %d,%d tid %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid) ); TRACE_EVENT(ext4_fc_commit_stop, TP_PROTO(struct super_block *sb, int nblks, int reason, tid_t commit_tid), TP_ARGS(sb, nblks, reason, commit_tid), TP_STRUCT__entry( __field(dev_t, dev) __field(int, nblks) __field(int, reason) __field(int, num_fc) __field(int, num_fc_ineligible) __field(int, nblks_agg) __field(tid_t, tid) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->nblks = nblks; __entry->reason = reason; __entry->num_fc = EXT4_SB(sb)->s_fc_stats.fc_num_commits; __entry->num_fc_ineligible = EXT4_SB(sb)->s_fc_stats.fc_ineligible_commits; __entry->nblks_agg = EXT4_SB(sb)->s_fc_stats.fc_numblks; __entry->tid = commit_tid; ), 
TP_printk("dev %d,%d nblks %d, reason %d, fc = %d, ineligible = %d, agg_nblks %d, tid %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nblks, __entry->reason, __entry->num_fc, __entry->num_fc_ineligible, __entry->nblks_agg, __entry->tid) ); #define FC_REASON_NAME_STAT(reason) \ show_fc_reason(reason), \ __entry->fc_ineligible_rc[reason] TRACE_EVENT(ext4_fc_stats, TP_PROTO(struct super_block *sb), TP_ARGS(sb), TP_STRUCT__entry( __field(dev_t, dev) __array(unsigned int, fc_ineligible_rc, EXT4_FC_REASON_MAX) __field(unsigned long, fc_commits) __field(unsigned long, fc_ineligible_commits) __field(unsigned long, fc_numblks) ), TP_fast_assign( int i; __entry->dev = sb->s_dev; for (i = 0; i < EXT4_FC_REASON_MAX; i++) { __entry->fc_ineligible_rc[i] = EXT4_SB(sb)->s_fc_stats.fc_ineligible_reason_count[i]; } __entry->fc_commits = EXT4_SB(sb)->s_fc_stats.fc_num_commits; __entry->fc_ineligible_commits = EXT4_SB(sb)->s_fc_stats.fc_ineligible_commits; __entry->fc_numblks = EXT4_SB(sb)->s_fc_stats.fc_numblks; ), TP_printk("dev %d,%d fc ineligible reasons:\n" "%s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u" "num_commits:%lu, ineligible: %lu, numblks: %lu", MAJOR(__entry->dev), MINOR(__entry->dev), FC_REASON_NAME_STAT(EXT4_FC_REASON_XATTR), FC_REASON_NAME_STAT(EXT4_FC_REASON_CROSS_RENAME), FC_REASON_NAME_STAT(EXT4_FC_REASON_JOURNAL_FLAG_CHANGE), FC_REASON_NAME_STAT(EXT4_FC_REASON_NOMEM), FC_REASON_NAME_STAT(EXT4_FC_REASON_SWAP_BOOT), FC_REASON_NAME_STAT(EXT4_FC_REASON_RESIZE), FC_REASON_NAME_STAT(EXT4_FC_REASON_RENAME_DIR), FC_REASON_NAME_STAT(EXT4_FC_REASON_FALLOC_RANGE), FC_REASON_NAME_STAT(EXT4_FC_REASON_INODE_JOURNAL_DATA), FC_REASON_NAME_STAT(EXT4_FC_REASON_ENCRYPTED_FILENAME), __entry->fc_commits, __entry->fc_ineligible_commits, __entry->fc_numblks) ); DECLARE_EVENT_CLASS(ext4_fc_track_dentry, TP_PROTO(handle_t *handle, struct inode *inode, struct dentry *dentry, int ret), TP_ARGS(handle, inode, dentry, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(tid_t, t_tid) __field(ino_t, i_ino) __field(tid_t, i_sync_tid) __field(int, error) ), TP_fast_assign( struct ext4_inode_info *ei = EXT4_I(inode); __entry->dev = inode->i_sb->s_dev; __entry->t_tid = handle->h_transaction->t_tid; __entry->i_ino = inode->i_ino; __entry->i_sync_tid = ei->i_sync_tid; __entry->error = ret; ), TP_printk("dev %d,%d, t_tid %u, ino %lu, i_sync_tid %u, error %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->t_tid, __entry->i_ino, __entry->i_sync_tid, __entry->error ) ); #define DEFINE_EVENT_CLASS_DENTRY(__type) \ DEFINE_EVENT(ext4_fc_track_dentry, ext4_fc_track_##__type, \ TP_PROTO(handle_t *handle, struct inode *inode, \ struct dentry *dentry, int ret), \ TP_ARGS(handle, inode, dentry, ret) \ ) DEFINE_EVENT_CLASS_DENTRY(create); DEFINE_EVENT_CLASS_DENTRY(link); DEFINE_EVENT_CLASS_DENTRY(unlink); TRACE_EVENT(ext4_fc_track_inode, TP_PROTO(handle_t *handle, struct inode *inode, int ret), TP_ARGS(handle, inode, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(tid_t, t_tid) __field(ino_t, i_ino) __field(tid_t, i_sync_tid) __field(int, error) ), TP_fast_assign( struct ext4_inode_info *ei = EXT4_I(inode); __entry->dev = inode->i_sb->s_dev; __entry->t_tid = handle->h_transaction->t_tid; __entry->i_ino = inode->i_ino; __entry->i_sync_tid = ei->i_sync_tid; __entry->error = ret; ), TP_printk("dev %d:%d, t_tid %u, inode %lu, i_sync_tid %u, error %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->t_tid, __entry->i_ino, __entry->i_sync_tid, __entry->error) ); TRACE_EVENT(ext4_fc_track_range, 
TP_PROTO(handle_t *handle, struct inode *inode, long start, long end, int ret), TP_ARGS(handle, inode, start, end, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(tid_t, t_tid) __field(ino_t, i_ino) __field(tid_t, i_sync_tid) __field(long, start) __field(long, end) __field(int, error) ), TP_fast_assign( struct ext4_inode_info *ei = EXT4_I(inode); __entry->dev = inode->i_sb->s_dev; __entry->t_tid = handle->h_transaction->t_tid; __entry->i_ino = inode->i_ino; __entry->i_sync_tid = ei->i_sync_tid; __entry->start = start; __entry->end = end; __entry->error = ret; ), TP_printk("dev %d:%d, t_tid %u, inode %lu, i_sync_tid %u, error %d, start %ld, end %ld", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->t_tid, __entry->i_ino, __entry->i_sync_tid, __entry->error, __entry->start, __entry->end) ); TRACE_EVENT(ext4_fc_cleanup, TP_PROTO(journal_t *journal, int full, tid_t tid), TP_ARGS(journal, full, tid), TP_STRUCT__entry( __field(dev_t, dev) __field(int, j_fc_off) __field(int, full) __field(tid_t, tid) ), TP_fast_assign( struct super_block *sb = journal->j_private; __entry->dev = sb->s_dev; __entry->j_fc_off = journal->j_fc_off; __entry->full = full; __entry->tid = tid; ), TP_printk("dev %d,%d, j_fc_off %d, full %d, tid %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->j_fc_off, __entry->full, __entry->tid) ); TRACE_EVENT(ext4_update_sb, TP_PROTO(struct super_block *sb, ext4_fsblk_t fsblk, unsigned int flags), TP_ARGS(sb, fsblk, flags), TP_STRUCT__entry( __field(dev_t, dev) __field(ext4_fsblk_t, fsblk) __field(unsigned int, flags) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->fsblk = fsblk; __entry->flags = flags; ), TP_printk("dev %d,%d fsblk %llu flags %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->fsblk, __entry->flags) ); #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
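/*
 * Illustrative sketch (not part of the header above; the function below and
 * its file placement are assumptions): each TRACE_EVENT()/DEFINE_EVENT() in
 * this header expands into a static inline trace_<name>() hook that ext4 code
 * calls at the instrumented spot. One translation unit in the subsystem
 * defines CREATE_TRACE_POINTS before including the header so the tracepoint
 * bodies are emitted; every other user simply includes it and calls the hook.
 */
#if 0	/* sketch only, never compiled */
#include <trace/events/ext4.h>

static void example_insert_and_trace(struct inode *inode,
				     struct extent_status *newes)
{
	/* ... insert newes into the extent status tree ... */

	/* fires the ext4_es_insert_extent event defined above */
	trace_ext4_es_insert_extent(inode, newes);
}
#endif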
/* SPDX-License-Identifier: GPL-2.0 */ /* * sysfs.h - definitions for the device driver filesystem * * Copyright (c) 2001,2002 Patrick Mochel * Copyright (c) 2004 Silicon Graphics, Inc. * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * * Please see Documentation/filesystems/sysfs.rst for more information.
*/ #ifndef _SYSFS_H_ #define _SYSFS_H_ #include <linux/kernfs.h> #include <linux/compiler.h> #include <linux/errno.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/kobject_ns.h> #include <linux/stat.h> #include <linux/atomic.h> struct kobject; struct module; struct bin_attribute; enum kobj_ns_type; struct attribute { const char *name; umode_t mode; #ifdef CONFIG_DEBUG_LOCK_ALLOC bool ignore_lockdep:1; struct lock_class_key *key; struct lock_class_key skey; #endif }; /** * sysfs_attr_init - initialize a dynamically allocated sysfs attribute * @attr: struct attribute to initialize * * Initialize a dynamically allocated struct attribute so we can * make lockdep happy. This is a new requirement for attributes * and initially this is only needed when lockdep is enabled. * Lockdep gives a nice error when your attribute is added to * sysfs if you don't have this. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC #define sysfs_attr_init(attr) \ do { \ static struct lock_class_key __key; \ \ (attr)->key = &__key; \ } while (0) #else #define sysfs_attr_init(attr) do {} while (0) #endif /** * struct attribute_group - data structure used to declare an attribute group. * @name: Optional: Attribute group name * If specified, the attribute group will be created in a * new subdirectory with this name. Additionally when a * group is named, @is_visible and @is_bin_visible may * return SYSFS_GROUP_INVISIBLE to control visibility of * the directory itself. * @is_visible: Optional: Function to return permissions associated with an * attribute of the group. Will be called repeatedly for * each non-binary attribute in the group. Only read/write * permissions as well as SYSFS_PREALLOC are accepted. Must * return 0 if an attribute is not visible. The returned * value will replace static permissions defined in struct * attribute. Use SYSFS_GROUP_VISIBLE() when assigning this * callback to specify separate _group_visible() and * _attr_visible() handlers. * @is_bin_visible: * Optional: Function to return permissions associated with a * binary attribute of the group. Will be called repeatedly * for each binary attribute in the group. Only read/write * permissions as well as SYSFS_PREALLOC (and the * visibility flags for named groups) are accepted. Must * return 0 if a binary attribute is not visible. The * returned value will replace static permissions defined * in struct bin_attribute. If @is_visible is not set, Use * SYSFS_GROUP_VISIBLE() when assigning this callback to * specify separate _group_visible() and _attr_visible() * handlers. * @bin_size: * Optional: Function to return the size of a binary attribute * of the group. Will be called repeatedly for each binary * attribute in the group. Overwrites the size field embedded * inside the attribute itself. * @attrs: Pointer to NULL terminated list of attributes. * @bin_attrs: Pointer to NULL terminated list of binary attributes. * Either attrs or bin_attrs or both must be provided. 
*/ struct attribute_group { const char *name; umode_t (*is_visible)(struct kobject *, struct attribute *, int); umode_t (*is_bin_visible)(struct kobject *, const struct bin_attribute *, int); size_t (*bin_size)(struct kobject *, const struct bin_attribute *, int); struct attribute **attrs; union { struct bin_attribute **bin_attrs; const struct bin_attribute *const *bin_attrs_new; }; }; #define SYSFS_PREALLOC 010000 #define SYSFS_GROUP_INVISIBLE 020000 /* * DEFINE_SYSFS_GROUP_VISIBLE(name): * A helper macro to pair with the assignment of ".is_visible = * SYSFS_GROUP_VISIBLE(name)", that arranges for the directory * associated with a named attribute_group to optionally be hidden. * This allows for static declaration of attribute_groups, and the * simplification of attribute visibility lifetime that implies, * without polluting sysfs with empty attribute directories. * Ex. * * static umode_t example_attr_visible(struct kobject *kobj, * struct attribute *attr, int n) * { * if (example_attr_condition) * return 0; * else if (ro_attr_condition) * return 0444; * return a->mode; * } * * static bool example_group_visible(struct kobject *kobj) * { * if (example_group_condition) * return false; * return true; * } * * DEFINE_SYSFS_GROUP_VISIBLE(example); * * static struct attribute_group example_group = { * .name = "example", * .is_visible = SYSFS_GROUP_VISIBLE(example), * .attrs = &example_attrs, * }; * * Note that it expects <name>_attr_visible and <name>_group_visible to * be defined. For cases where individual attributes do not need * separate visibility consideration, only entire group visibility at * once, see DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(). */ #define DEFINE_SYSFS_GROUP_VISIBLE(name) \ static inline umode_t sysfs_group_visible_##name( \ struct kobject *kobj, struct attribute *attr, int n) \ { \ if (n == 0 && !name##_group_visible(kobj)) \ return SYSFS_GROUP_INVISIBLE; \ return name##_attr_visible(kobj, attr, n); \ } /* * DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(name): * A helper macro to pair with SYSFS_GROUP_VISIBLE() that like * DEFINE_SYSFS_GROUP_VISIBLE() controls group visibility, but does * not require the implementation of a per-attribute visibility * callback. * Ex. * * static bool example_group_visible(struct kobject *kobj) * { * if (example_group_condition) * return false; * return true; * } * * DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(example); * * static struct attribute_group example_group = { * .name = "example", * .is_visible = SYSFS_GROUP_VISIBLE(example), * .attrs = &example_attrs, * }; */ #define DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(name) \ static inline umode_t sysfs_group_visible_##name( \ struct kobject *kobj, struct attribute *a, int n) \ { \ if (n == 0 && !name##_group_visible(kobj)) \ return SYSFS_GROUP_INVISIBLE; \ return a->mode; \ } /* * Same as DEFINE_SYSFS_GROUP_VISIBLE, but for groups with only binary * attributes. 
If an attribute_group defines both text and binary * attributes, the group visibility is determined by the function * specified to is_visible() not is_bin_visible() */ #define DEFINE_SYSFS_BIN_GROUP_VISIBLE(name) \ static inline umode_t sysfs_group_visible_##name( \ struct kobject *kobj, const struct bin_attribute *attr, int n) \ { \ if (n == 0 && !name##_group_visible(kobj)) \ return SYSFS_GROUP_INVISIBLE; \ return name##_attr_visible(kobj, attr, n); \ } #define DEFINE_SIMPLE_SYSFS_BIN_GROUP_VISIBLE(name) \ static inline umode_t sysfs_group_visible_##name( \ struct kobject *kobj, const struct bin_attribute *a, int n) \ { \ if (n == 0 && !name##_group_visible(kobj)) \ return SYSFS_GROUP_INVISIBLE; \ return a->mode; \ } #define SYSFS_GROUP_VISIBLE(fn) sysfs_group_visible_##fn /* * Use these macros to make defining attributes easier. * See include/linux/device.h for examples.. */ #define __ATTR(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _show, \ .store = _store, \ } #define __ATTR_PREALLOC(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), \ .mode = SYSFS_PREALLOC | VERIFY_OCTAL_PERMISSIONS(_mode) },\ .show = _show, \ .store = _store, \ } #define __ATTR_RO(_name) { \ .attr = { .name = __stringify(_name), .mode = 0444 }, \ .show = _name##_show, \ } #define __ATTR_RO_MODE(_name, _mode) { \ .attr = { .name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _name##_show, \ } #define __ATTR_RW_MODE(_name, _mode) { \ .attr = { .name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _name##_show, \ .store = _name##_store, \ } #define __ATTR_WO(_name) { \ .attr = { .name = __stringify(_name), .mode = 0200 }, \ .store = _name##_store, \ } #define __ATTR_RW(_name) __ATTR(_name, 0644, _name##_show, _name##_store) #define __ATTR_NULL { .attr = { .name = NULL } } #ifdef CONFIG_DEBUG_LOCK_ALLOC #define __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), .mode = _mode, \ .ignore_lockdep = true }, \ .show = _show, \ .store = _store, \ } #else #define __ATTR_IGNORE_LOCKDEP __ATTR #endif #define __ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group *_name##_groups[] = { \ &_name##_group, \ NULL, \ } #define ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group _name##_group = { \ .attrs = _name##_attrs, \ }; \ __ATTRIBUTE_GROUPS(_name) #define BIN_ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group _name##_group = { \ .bin_attrs = _name##_attrs, \ }; \ __ATTRIBUTE_GROUPS(_name) struct file; struct vm_area_struct; struct address_space; struct bin_attribute { struct attribute attr; size_t size; void *private; struct address_space *(*f_mapping)(void); ssize_t (*read)(struct file *, struct kobject *, struct bin_attribute *, char *, loff_t, size_t); ssize_t (*read_new)(struct file *, struct kobject *, const struct bin_attribute *, char *, loff_t, size_t); ssize_t (*write)(struct file *, struct kobject *, struct bin_attribute *, char *, loff_t, size_t); ssize_t (*write_new)(struct file *, struct kobject *, const struct bin_attribute *, char *, loff_t, size_t); loff_t (*llseek)(struct file *, struct kobject *, const struct bin_attribute *, loff_t, int); int (*mmap)(struct file *, struct kobject *, const struct bin_attribute *attr, struct vm_area_struct *vma); }; /** * sysfs_bin_attr_init - initialize a dynamically allocated bin_attribute * @attr: struct bin_attribute to initialize * * Initialize a 
dynamically allocated struct bin_attribute so we * can make lockdep happy. This is a new requirement for * attributes and initially this is only needed when lockdep is * enabled. Lockdep gives a nice error when your attribute is * added to sysfs if you don't have this. */ #define sysfs_bin_attr_init(bin_attr) sysfs_attr_init(&(bin_attr)->attr) typedef ssize_t __sysfs_bin_rw_handler_new(struct file *, struct kobject *, const struct bin_attribute *, char *, loff_t, size_t); /* macros to create static binary attributes easier */ #define __BIN_ATTR(_name, _mode, _read, _write, _size) { \ .attr = { .name = __stringify(_name), .mode = _mode }, \ .read = _Generic(_read, \ __sysfs_bin_rw_handler_new * : NULL, \ default : _read \ ), \ .read_new = _Generic(_read, \ __sysfs_bin_rw_handler_new * : _read, \ default : NULL \ ), \ .write = _Generic(_write, \ __sysfs_bin_rw_handler_new * : NULL, \ default : _write \ ), \ .write_new = _Generic(_write, \ __sysfs_bin_rw_handler_new * : _write, \ default : NULL \ ), \ .size = _size, \ } #define __BIN_ATTR_RO(_name, _size) \ __BIN_ATTR(_name, 0444, _name##_read, NULL, _size) #define __BIN_ATTR_WO(_name, _size) \ __BIN_ATTR(_name, 0200, NULL, _name##_write, _size) #define __BIN_ATTR_RW(_name, _size) \ __BIN_ATTR(_name, 0644, _name##_read, _name##_write, _size) #define __BIN_ATTR_NULL __ATTR_NULL #define BIN_ATTR(_name, _mode, _read, _write, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR(_name, _mode, _read, \ _write, _size) #define BIN_ATTR_RO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_RO(_name, _size) #define BIN_ATTR_WO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_WO(_name, _size) #define BIN_ATTR_RW(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_RW(_name, _size) #define __BIN_ATTR_ADMIN_RO(_name, _size) \ __BIN_ATTR(_name, 0400, _name##_read, NULL, _size) #define __BIN_ATTR_ADMIN_RW(_name, _size) \ __BIN_ATTR(_name, 0600, _name##_read, _name##_write, _size) #define BIN_ATTR_ADMIN_RO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RO(_name, _size) #define BIN_ATTR_ADMIN_RW(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RW(_name, _size) #define __BIN_ATTR_SIMPLE_RO(_name, _mode) \ __BIN_ATTR(_name, _mode, sysfs_bin_attr_simple_read, NULL, 0) #define BIN_ATTR_SIMPLE_RO(_name) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_SIMPLE_RO(_name, 0444) #define BIN_ATTR_SIMPLE_ADMIN_RO(_name) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_SIMPLE_RO(_name, 0400) struct sysfs_ops { ssize_t (*show)(struct kobject *, struct attribute *, char *); ssize_t (*store)(struct kobject *, struct attribute *, const char *, size_t); }; #ifdef CONFIG_SYSFS int __must_check sysfs_create_dir_ns(struct kobject *kobj, const void *ns); void sysfs_remove_dir(struct kobject *kobj); int __must_check sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns); int __must_check sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const void *new_ns); int __must_check sysfs_create_mount_point(struct kobject *parent_kobj, const char *name); void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name); int __must_check sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns); int __must_check sysfs_create_files(struct kobject *kobj, const struct attribute * const *attr); int __must_check sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t 
mode); struct kernfs_node *sysfs_break_active_protection(struct kobject *kobj, const struct attribute *attr); void sysfs_unbreak_active_protection(struct kernfs_node *kn); void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns); bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr); void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *attr); int __must_check sysfs_create_bin_file(struct kobject *kobj, const struct bin_attribute *attr); void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr); int __must_check sysfs_create_link(struct kobject *kobj, struct kobject *target, const char *name); int __must_check sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, const char *name); void sysfs_remove_link(struct kobject *kobj, const char *name); int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *target, const char *old_name, const char *new_name, const void *new_ns); void sysfs_delete_link(struct kobject *dir, struct kobject *targ, const char *name); int __must_check sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp); int __must_check sysfs_create_groups(struct kobject *kobj, const struct attribute_group **groups); int __must_check sysfs_update_groups(struct kobject *kobj, const struct attribute_group **groups); int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_remove_groups(struct kobject *kobj, const struct attribute_group **groups); int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group); void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group); int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp); int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name); void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name); int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, struct kobject *target_kobj, const char *target_name, const char *symlink_name); void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr); int __must_check sysfs_init(void); static inline void sysfs_enable_ns(struct kernfs_node *kn) { return kernfs_enable_ns(kn); } int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, kgid_t kgid); int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid); int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid); int sysfs_groups_change_owner(struct kobject *kobj, const struct attribute_group **groups, kuid_t kuid, kgid_t kgid); int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group *groups, kuid_t kuid, kgid_t kgid); __printf(2, 3) int sysfs_emit(char *buf, const char *fmt, ...); __printf(3, 4) int sysfs_emit_at(char *buf, int at, const char *fmt, ...); ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count); #else /* CONFIG_SYSFS */ static inline int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { return 0; } static inline void sysfs_remove_dir(struct kobject 
*kobj) { } static inline int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns) { return 0; } static inline int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const void *new_ns) { return 0; } static inline int sysfs_create_mount_point(struct kobject *parent_kobj, const char *name) { return 0; } static inline void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name) { } static inline int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { return 0; } static inline int sysfs_create_files(struct kobject *kobj, const struct attribute * const *attr) { return 0; } static inline int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode) { return 0; } static inline struct kernfs_node * sysfs_break_active_protection(struct kobject *kobj, const struct attribute *attr) { return NULL; } static inline void sysfs_unbreak_active_protection(struct kernfs_node *kn) { } static inline void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { } static inline bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr) { return false; } static inline void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *attr) { } static inline int sysfs_create_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { return 0; } static inline void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { } static inline int sysfs_create_link(struct kobject *kobj, struct kobject *target, const char *name) { return 0; } static inline int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, const char *name) { return 0; } static inline void sysfs_remove_link(struct kobject *kobj, const char *name) { } static inline int sysfs_rename_link_ns(struct kobject *k, struct kobject *t, const char *old_name, const char *new_name, const void *ns) { return 0; } static inline void sysfs_delete_link(struct kobject *k, struct kobject *t, const char *name) { } static inline int sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline int sysfs_create_groups(struct kobject *kobj, const struct attribute_group **groups) { return 0; } static inline int sysfs_update_groups(struct kobject *kobj, const struct attribute_group **groups) { return 0; } static inline int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp) { } static inline void sysfs_remove_groups(struct kobject *kobj, const struct attribute_group **groups) { } static inline int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group) { return 0; } static inline void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group) { } static inline int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp) { } static inline int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name) { return 0; } static inline void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name) { } static inline int compat_only_sysfs_link_entry_to_kobj(struct kobject 
*kobj, struct kobject *target_kobj, const char *target_name, const char *symlink_name) { return 0; } static inline void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr) { } static inline int __must_check sysfs_init(void) { return 0; } static inline void sysfs_enable_ns(struct kernfs_node *kn) { } static inline int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_groups_change_owner(struct kobject *kobj, const struct attribute_group **groups, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group *groups, kuid_t kuid, kgid_t kgid) { return 0; } __printf(2, 3) static inline int sysfs_emit(char *buf, const char *fmt, ...) { return 0; } __printf(3, 4) static inline int sysfs_emit_at(char *buf, int at, const char *fmt, ...) { return 0; } static inline ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { return 0; } #endif /* CONFIG_SYSFS */ static inline int __must_check sysfs_create_file(struct kobject *kobj, const struct attribute *attr) { return sysfs_create_file_ns(kobj, attr, NULL); } static inline void sysfs_remove_file(struct kobject *kobj, const struct attribute *attr) { sysfs_remove_file_ns(kobj, attr, NULL); } static inline int sysfs_rename_link(struct kobject *kobj, struct kobject *target, const char *old_name, const char *new_name) { return sysfs_rename_link_ns(kobj, target, old_name, new_name, NULL); } static inline void sysfs_notify_dirent(struct kernfs_node *kn) { kernfs_notify(kn); } static inline struct kernfs_node *sysfs_get_dirent(struct kernfs_node *parent, const char *name) { return kernfs_find_and_get(parent, name); } static inline struct kernfs_node *sysfs_get(struct kernfs_node *kn) { kernfs_get(kn); return kn; } static inline void sysfs_put(struct kernfs_node *kn) { kernfs_put(kn); } #endif /* _SYSFS_H_ */
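/*
 * Illustrative sketch (not part of the header above; the "example_*" names
 * are hypothetical): the usual pairing of the __ATTR_RO() and
 * ATTRIBUTE_GROUPS() helpers with a sysfs_emit()-based show() callback for a
 * read-only kobject attribute.
 */
#if 0	/* sketch only, never compiled */
#include <linux/kobject.h>
#include <linux/sysfs.h>

static ssize_t example_value_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	/* sysfs_emit() keeps the output within the single-page sysfs buffer */
	return sysfs_emit(buf, "%d\n", 42);
}

/* expands to .attr.name = "example_value", .mode = 0444, .show = example_value_show */
static struct kobj_attribute example_value_attr = __ATTR_RO(example_value);

static struct attribute *example_attrs[] = {
	&example_value_attr.attr,
	NULL,	/* attribute lists are NULL terminated */
};
/* defines example_group and the NULL-terminated example_groups[] array */
ATTRIBUTE_GROUPS(example);
#endif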
// SPDX-License-Identifier: GPL-2.0-only /* Connection state tracking for netfilter. This is separated from, but required by, the NAT layer; it can also be used by an iptables extension.
*/ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> * (C) 2005-2012 Patrick McHardy <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/netfilter.h> #include <linux/module.h> #include <linux/sched.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/vmalloc.h> #include <linux/stddef.h> #include <linux/slab.h> #include <linux/random.h> #include <linux/siphash.h> #include <linux/err.h> #include <linux/percpu.h> #include <linux/moduleparam.h> #include <linux/notifier.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/socket.h> #include <linux/mm.h> #include <linux/nsproxy.h> #include <linux/rculist_nulls.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_bpf.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_timestamp.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_labels.h> #include <net/netfilter/nf_conntrack_synproxy.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_helper.h> #include <net/netns/hash.h> #include <net/ip.h> #include "nf_internals.h" __cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; EXPORT_SYMBOL_GPL(nf_conntrack_locks); __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock); EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock); struct hlist_nulls_head *nf_conntrack_hash __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_hash); struct conntrack_gc_work { struct delayed_work dwork; u32 next_bucket; u32 avg_timeout; u32 count; u32 start_time; bool exiting; bool early_drop; }; static __read_mostly struct kmem_cache *nf_conntrack_cachep; static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); static __read_mostly bool nf_conntrack_locks_all; /* serialize hash resizes and nf_ct_iterate_cleanup */ static DEFINE_MUTEX(nf_conntrack_mutex); #define GC_SCAN_INTERVAL_MAX (60ul * HZ) #define GC_SCAN_INTERVAL_MIN (1ul * HZ) /* clamp timeouts to this value (TCP unacked) */ #define GC_SCAN_INTERVAL_CLAMP (300ul * HZ) /* Initial bias pretending we have 100 entries at the upper bound so we don't * wakeup often just because we have three entries with a 1s timeout while still * allowing non-idle machines to wakeup more often when needed. 
*/ #define GC_SCAN_INITIAL_COUNT 100 #define GC_SCAN_INTERVAL_INIT GC_SCAN_INTERVAL_MAX #define GC_SCAN_MAX_DURATION msecs_to_jiffies(10) #define GC_SCAN_EXPIRED_MAX (64000u / HZ) #define MIN_CHAINLEN 50u #define MAX_CHAINLEN (80u - MIN_CHAINLEN) static struct conntrack_gc_work conntrack_gc_work; void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) { /* 1) Acquire the lock */ spin_lock(lock); /* 2) read nf_conntrack_locks_all, with ACQUIRE semantics * It pairs with the smp_store_release() in nf_conntrack_all_unlock() */ if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false)) return; /* fast path failed, unlock */ spin_unlock(lock); /* Slow path 1) get global lock */ spin_lock(&nf_conntrack_locks_all_lock); /* Slow path 2) get the lock we want */ spin_lock(lock); /* Slow path 3) release the global lock */ spin_unlock(&nf_conntrack_locks_all_lock); } EXPORT_SYMBOL_GPL(nf_conntrack_lock); static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2) { h1 %= CONNTRACK_LOCKS; h2 %= CONNTRACK_LOCKS; spin_unlock(&nf_conntrack_locks[h1]); if (h1 != h2) spin_unlock(&nf_conntrack_locks[h2]); } /* return true if we need to recompute hashes (in case hash table was resized) */ static bool nf_conntrack_double_lock(struct net *net, unsigned int h1, unsigned int h2, unsigned int sequence) { h1 %= CONNTRACK_LOCKS; h2 %= CONNTRACK_LOCKS; if (h1 <= h2) { nf_conntrack_lock(&nf_conntrack_locks[h1]); if (h1 != h2) spin_lock_nested(&nf_conntrack_locks[h2], SINGLE_DEPTH_NESTING); } else { nf_conntrack_lock(&nf_conntrack_locks[h2]); spin_lock_nested(&nf_conntrack_locks[h1], SINGLE_DEPTH_NESTING); } if (read_seqcount_retry(&nf_conntrack_generation, sequence)) { nf_conntrack_double_unlock(h1, h2); return true; } return false; } static void nf_conntrack_all_lock(void) __acquires(&nf_conntrack_locks_all_lock) { int i; spin_lock(&nf_conntrack_locks_all_lock); /* For nf_conntrack_locks_all, only the latest time when another * CPU will see an update is controlled by the "release" of the * spin_lock below. * The earliest time is not controlled, and thus KCSAN could detect * a race when nf_conntrack_lock() reads the variable. * WRITE_ONCE() is used to ensure the compiler will not * optimize the write. */ WRITE_ONCE(nf_conntrack_locks_all, true); for (i = 0; i < CONNTRACK_LOCKS; i++) { spin_lock(&nf_conntrack_locks[i]); /* This spin_unlock provides the "release" to ensure that * nf_conntrack_locks_all==true is visible to everyone that * acquired spin_lock(&nf_conntrack_locks[]). */ spin_unlock(&nf_conntrack_locks[i]); } } static void nf_conntrack_all_unlock(void) __releases(&nf_conntrack_locks_all_lock) { /* All prior stores must be complete before we clear * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock() * might observe the false value but not the entire * critical section. 
* It pairs with the smp_load_acquire() in nf_conntrack_lock() */ smp_store_release(&nf_conntrack_locks_all, false); spin_unlock(&nf_conntrack_locks_all_lock); } unsigned int nf_conntrack_htable_size __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); unsigned int nf_conntrack_max __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_max); seqcount_spinlock_t nf_conntrack_generation __read_mostly; static siphash_aligned_key_t nf_conntrack_hash_rnd; static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, unsigned int zoneid, const struct net *net) { siphash_key_t key; get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd)); key = nf_conntrack_hash_rnd; key.key[0] ^= zoneid; key.key[1] ^= net_hash_mix(net); return siphash((void *)tuple, offsetofend(struct nf_conntrack_tuple, dst.__nfct_hash_offsetend), &key); } static u32 scale_hash(u32 hash) { return reciprocal_scale(hash, nf_conntrack_htable_size); } static u32 __hash_conntrack(const struct net *net, const struct nf_conntrack_tuple *tuple, unsigned int zoneid, unsigned int size) { return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size); } static u32 hash_conntrack(const struct net *net, const struct nf_conntrack_tuple *tuple, unsigned int zoneid) { return scale_hash(hash_conntrack_raw(tuple, zoneid, net)); } static bool nf_ct_get_tuple_ports(const struct sk_buff *skb, unsigned int dataoff, struct nf_conntrack_tuple *tuple) { struct { __be16 sport; __be16 dport; } _inet_hdr, *inet_hdr; /* Actually only need first 4 bytes to get ports. */ inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr); if (!inet_hdr) return false; tuple->src.u.udp.port = inet_hdr->sport; tuple->dst.u.udp.port = inet_hdr->dport; return true; } static bool nf_ct_get_tuple(const struct sk_buff *skb, unsigned int nhoff, unsigned int dataoff, u_int16_t l3num, u_int8_t protonum, struct net *net, struct nf_conntrack_tuple *tuple) { unsigned int size; const __be32 *ap; __be32 _addrs[8]; memset(tuple, 0, sizeof(*tuple)); tuple->src.l3num = l3num; switch (l3num) { case NFPROTO_IPV4: nhoff += offsetof(struct iphdr, saddr); size = 2 * sizeof(__be32); break; case NFPROTO_IPV6: nhoff += offsetof(struct ipv6hdr, saddr); size = sizeof(_addrs); break; default: return true; } ap = skb_header_pointer(skb, nhoff, size, _addrs); if (!ap) return false; switch (l3num) { case NFPROTO_IPV4: tuple->src.u3.ip = ap[0]; tuple->dst.u3.ip = ap[1]; break; case NFPROTO_IPV6: memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6)); memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6)); break; } tuple->dst.protonum = protonum; tuple->dst.dir = IP_CT_DIR_ORIGINAL; switch (protonum) { #if IS_ENABLED(CONFIG_IPV6) case IPPROTO_ICMPV6: return icmpv6_pkt_to_tuple(skb, dataoff, net, tuple); #endif case IPPROTO_ICMP: return icmp_pkt_to_tuple(skb, dataoff, net, tuple); #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: return gre_pkt_to_tuple(skb, dataoff, net, tuple); #endif case IPPROTO_TCP: case IPPROTO_UDP: #ifdef CONFIG_NF_CT_PROTO_UDPLITE case IPPROTO_UDPLITE: #endif #ifdef CONFIG_NF_CT_PROTO_SCTP case IPPROTO_SCTP: #endif #ifdef CONFIG_NF_CT_PROTO_DCCP case IPPROTO_DCCP: #endif /* fallthrough */ return nf_ct_get_tuple_ports(skb, dataoff, tuple); default: break; } return true; } static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, u_int8_t *protonum) { int dataoff = -1; const struct iphdr *iph; struct iphdr _iph; iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); if (!iph) return -1; /* Conntrack defragments 
packets, we might still see fragments * inside ICMP packets though. */ if (iph->frag_off & htons(IP_OFFSET)) return -1; dataoff = nhoff + (iph->ihl << 2); *protonum = iph->protocol; /* Check bogus IP headers */ if (dataoff > skb->len) { pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n", nhoff, iph->ihl << 2, skb->len); return -1; } return dataoff; } #if IS_ENABLED(CONFIG_IPV6) static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, u8 *protonum) { int protoff = -1; unsigned int extoff = nhoff + sizeof(struct ipv6hdr); __be16 frag_off; u8 nexthdr; if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr), &nexthdr, sizeof(nexthdr)) != 0) { pr_debug("can't get nexthdr\n"); return -1; } protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off); /* * (protoff == skb->len) means the packet has no data, just * IPv6 and possibly extension headers, but it is tracked anyway */ if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { pr_debug("can't find proto in pkt\n"); return -1; } *protonum = nexthdr; return protoff; } #endif static int get_l4proto(const struct sk_buff *skb, unsigned int nhoff, u8 pf, u8 *l4num) { switch (pf) { case NFPROTO_IPV4: return ipv4_get_l4proto(skb, nhoff, l4num); #if IS_ENABLED(CONFIG_IPV6) case NFPROTO_IPV6: return ipv6_get_l4proto(skb, nhoff, l4num); #endif default: *l4num = 0; break; } return -1; } bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, u_int16_t l3num, struct net *net, struct nf_conntrack_tuple *tuple) { u8 protonum; int protoff; protoff = get_l4proto(skb, nhoff, l3num, &protonum); if (protoff <= 0) return false; return nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple); } EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr); bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, const struct nf_conntrack_tuple *orig) { memset(inverse, 0, sizeof(*inverse)); inverse->src.l3num = orig->src.l3num; switch (orig->src.l3num) { case NFPROTO_IPV4: inverse->src.u3.ip = orig->dst.u3.ip; inverse->dst.u3.ip = orig->src.u3.ip; break; case NFPROTO_IPV6: inverse->src.u3.in6 = orig->dst.u3.in6; inverse->dst.u3.in6 = orig->src.u3.in6; break; default: break; } inverse->dst.dir = !orig->dst.dir; inverse->dst.protonum = orig->dst.protonum; switch (orig->dst.protonum) { case IPPROTO_ICMP: return nf_conntrack_invert_icmp_tuple(inverse, orig); #if IS_ENABLED(CONFIG_IPV6) case IPPROTO_ICMPV6: return nf_conntrack_invert_icmpv6_tuple(inverse, orig); #endif } inverse->src.u.all = orig->dst.u.all; inverse->dst.u.all = orig->src.u.all; return true; } EXPORT_SYMBOL_GPL(nf_ct_invert_tuple); /* Generate an almost-unique pseudo-id for a given conntrack. * * intentionally doesn't re-use any of the seeds used for hash * table location; we assume the id gets exposed to userspace. * * Following nf_conn items do not change throughout lifetime * of the nf_conn: * * 1. nf_conn address * 2. nf_conn->master address (normally NULL) * 3. the associated net namespace * 4.
the original direction tuple */ u32 nf_ct_get_id(const struct nf_conn *ct) { static siphash_aligned_key_t ct_id_seed; unsigned long a, b, c, d; net_get_random_once(&ct_id_seed, sizeof(ct_id_seed)); a = (unsigned long)ct; b = (unsigned long)ct->master; c = (unsigned long)nf_ct_net(ct); d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple), &ct_id_seed); #ifdef CONFIG_64BIT return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed); #else return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed); #endif } EXPORT_SYMBOL_GPL(nf_ct_get_id); static void clean_from_lists(struct nf_conn *ct) { hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode); /* Destroy all pending expectations */ nf_ct_remove_expectations(ct); } #define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK) /* Released via nf_ct_destroy() */ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, const struct nf_conntrack_zone *zone, gfp_t flags) { struct nf_conn *tmpl, *p; if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) { tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags); if (!tmpl) return NULL; p = tmpl; tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); if (tmpl != p) { tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p; } } else { tmpl = kzalloc(sizeof(*tmpl), flags); if (!tmpl) return NULL; } tmpl->status = IPS_TEMPLATE; write_pnet(&tmpl->ct_net, net); nf_ct_zone_add(tmpl, zone); refcount_set(&tmpl->ct_general.use, 1); return tmpl; } EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); void nf_ct_tmpl_free(struct nf_conn *tmpl) { kfree(tmpl->ext); if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) kfree((char *)tmpl - tmpl->proto.tmpl_padto); else kfree(tmpl); } EXPORT_SYMBOL_GPL(nf_ct_tmpl_free); static void destroy_gre_conntrack(struct nf_conn *ct) { #ifdef CONFIG_NF_CT_PROTO_GRE struct nf_conn *master = ct->master; if (master) nf_ct_gre_keymap_destroy(master); #endif } void nf_ct_destroy(struct nf_conntrack *nfct) { struct nf_conn *ct = (struct nf_conn *)nfct; WARN_ON(refcount_read(&nfct->use) != 0); if (unlikely(nf_ct_is_template(ct))) { nf_ct_tmpl_free(ct); return; } if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE)) destroy_gre_conntrack(ct); /* Expectations will have been removed in clean_from_lists, * except TFTP can create an expectation on the first packet, * before connection is in the list, so we need to clean here, * too. 
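 *
 * A minimal sketch of how we normally end up here (illustration only,
 * assuming the usual refcount wiring): the last user does
 *
 *	nf_ct_put(ct);		- refcount drops to zero
 *				- nf_ct_destroy(&ct->ct_general) runs
 *
 * so, together with the WARN_ON above, the teardown below can assume
 * no other CPU still holds a reference to this entry.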
*/ nf_ct_remove_expectations(ct); if (ct->master) nf_ct_put(ct->master); nf_conntrack_free(ct); } EXPORT_SYMBOL(nf_ct_destroy); static void __nf_ct_delete_from_lists(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); unsigned int hash, reply_hash; unsigned int sequence; do { sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL)); reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); clean_from_lists(ct); nf_conntrack_double_unlock(hash, reply_hash); } static void nf_ct_delete_from_lists(struct nf_conn *ct) { nf_ct_helper_destroy(ct); local_bh_disable(); __nf_ct_delete_from_lists(ct); local_bh_enable(); } static void nf_ct_add_to_ecache_list(struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_EVENTS struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct)); spin_lock(&cnet->ecache.dying_lock); hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, &cnet->ecache.dying_list); spin_unlock(&cnet->ecache.dying_lock); #endif } bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) { struct nf_conn_tstamp *tstamp; struct net *net; if (test_and_set_bit(IPS_DYING_BIT, &ct->status)) return false; tstamp = nf_conn_tstamp_find(ct); if (tstamp) { s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp; tstamp->stop = ktime_get_real_ns(); if (timeout < 0) tstamp->stop -= jiffies_to_nsecs(-timeout); } if (nf_conntrack_event_report(IPCT_DESTROY, ct, portid, report) < 0) { /* destroy event was not delivered. nf_ct_put will * be done by event cache worker on redelivery. */ nf_ct_helper_destroy(ct); local_bh_disable(); __nf_ct_delete_from_lists(ct); nf_ct_add_to_ecache_list(ct); local_bh_enable(); nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL); return false; } net = nf_ct_net(ct); if (nf_conntrack_ecache_dwork_pending(net)) nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_SENT); nf_ct_delete_from_lists(ct); nf_ct_put(ct); return true; } EXPORT_SYMBOL_GPL(nf_ct_delete); static inline bool nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone, const struct net *net) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); /* A conntrack can be recreated with the equal tuple, * so we need to check that the conntrack is confirmed */ return nf_ct_tuple_equal(tuple, &h->tuple) && nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) && nf_ct_is_confirmed(ct) && net_eq(net, nf_ct_net(ct)); } static inline bool nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2) { return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple, &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) && nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple, &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) && nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) && nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) && net_eq(nf_ct_net(ct1), nf_ct_net(ct2)); } /* caller must hold rcu readlock and none of the nf_conntrack_locks */ static void nf_ct_gc_expired(struct nf_conn *ct) { if (!refcount_inc_not_zero(&ct->ct_general.use)) return; /* load ->status after refcount increase */ smp_acquire__after_ctrl_dep(); if (nf_ct_should_gc(ct)) nf_ct_kill(ct); nf_ct_put(ct); } /* * Warning : * - Caller must take a reference on returned object * and recheck nf_ct_tuple_equal(tuple, 
&h->tuple) */ static struct nf_conntrack_tuple_hash * ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, u32 hash) { struct nf_conntrack_tuple_hash *h; struct hlist_nulls_head *ct_hash; struct hlist_nulls_node *n; unsigned int bucket, hsize; begin: nf_conntrack_get_ht(&ct_hash, &hsize); bucket = reciprocal_scale(hash, hsize); hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) { struct nf_conn *ct; ct = nf_ct_tuplehash_to_ctrack(h); if (nf_ct_is_expired(ct)) { nf_ct_gc_expired(ct); continue; } if (nf_ct_key_equal(h, tuple, zone, net)) return h; } /* * if the nulls value we got at the end of this lookup is * not the expected one, we must restart lookup. * We probably met an item that was moved to another chain. */ if (get_nulls_value(n) != bucket) { NF_CT_STAT_INC_ATOMIC(net, search_restart); goto begin; } return NULL; } /* Find a connection corresponding to a tuple. */ static struct nf_conntrack_tuple_hash * __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, u32 hash) { struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; h = ____nf_conntrack_find(net, zone, tuple, hash); if (h) { /* We have a candidate that matches the tuple we're interested * in, try to obtain a reference and re-check tuple */ ct = nf_ct_tuplehash_to_ctrack(h); if (likely(refcount_inc_not_zero(&ct->ct_general.use))) { /* re-check key after refcount */ smp_acquire__after_ctrl_dep(); if (likely(nf_ct_key_equal(h, tuple, zone, net))) return h; /* TYPESAFE_BY_RCU recycled the candidate */ nf_ct_put(ct); } h = NULL; } return h; } struct nf_conntrack_tuple_hash * nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); struct nf_conntrack_tuple_hash *thash; rcu_read_lock(); thash = __nf_conntrack_find_get(net, zone, tuple, hash_conntrack_raw(tuple, zone_id, net)); if (thash) goto out_unlock; rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); if (rid != zone_id) thash = __nf_conntrack_find_get(net, zone, tuple, hash_conntrack_raw(tuple, rid, net)); out_unlock: rcu_read_unlock(); return thash; } EXPORT_SYMBOL_GPL(nf_conntrack_find_get); static void __nf_conntrack_hash_insert(struct nf_conn *ct, unsigned int hash, unsigned int reply_hash) { hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, &nf_conntrack_hash[hash]); hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &nf_conntrack_hash[reply_hash]); } static bool nf_ct_ext_valid_pre(const struct nf_ct_ext *ext) { /* if ext->gen_id is not equal to nf_conntrack_ext_genid, some extensions * may contain stale pointers to e.g. helper that has been removed. * * The helper can't clear this because the nf_conn object isn't in * any hash and synchronize_rcu() isn't enough because associated skb * might sit in a queue. */ return !ext || ext->gen_id == atomic_read(&nf_conntrack_ext_genid); } static bool nf_ct_ext_valid_post(struct nf_ct_ext *ext) { if (!ext) return true; if (ext->gen_id != atomic_read(&nf_conntrack_ext_genid)) return false; /* inserted into conntrack table, nf_ct_iterate_cleanup() * will find it. Disable nf_ct_ext_find() id check. 
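 *
 * Sketch of the intended usage, based on the callers in this file:
 * insertion paths check nf_ct_ext_valid_pre() before taking the bucket
 * locks and nf_ct_ext_valid_post() once the entry is about to become
 * (or has just become) visible; a failed post check makes the caller
 * back out with -EAGAIN or kill the freshly confirmed entry.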
*/ WRITE_ONCE(ext->gen_id, 0); return true; } int nf_conntrack_hash_check_insert(struct nf_conn *ct) { const struct nf_conntrack_zone *zone; struct net *net = nf_ct_net(ct); unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; unsigned int max_chainlen; unsigned int chainlen = 0; unsigned int sequence; int err = -EEXIST; zone = nf_ct_zone(ct); if (!nf_ct_ext_valid_pre(ct->ext)) return -EAGAIN; local_bh_disable(); do { sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL)); reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN); /* See if there's one in the list already, including reverse */ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, zone, net)) goto out; if (chainlen++ > max_chainlen) goto chaintoolong; } chainlen = 0; hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, zone, net)) goto out; if (chainlen++ > max_chainlen) goto chaintoolong; } /* If genid has changed, we can't insert anymore because ct * extensions could have stale pointers and nf_ct_iterate_destroy * might have completed its table scan already. * * Increment of the ext genid right after this check is fine: * nf_ct_iterate_destroy blocks until locks are released. */ if (!nf_ct_ext_valid_post(ct->ext)) { err = -EAGAIN; goto out; } smp_wmb(); /* The caller holds a reference to this object */ refcount_set(&ct->ct_general.use, 2); __nf_conntrack_hash_insert(ct, hash, reply_hash); nf_conntrack_double_unlock(hash, reply_hash); NF_CT_STAT_INC(net, insert); local_bh_enable(); return 0; chaintoolong: NF_CT_STAT_INC(net, chaintoolong); err = -ENOSPC; out: nf_conntrack_double_unlock(hash, reply_hash); local_bh_enable(); return err; } EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets, unsigned int bytes) { struct nf_conn_acct *acct; acct = nf_conn_acct_find(ct); if (acct) { struct nf_conn_counter *counter = acct->counter; atomic64_add(packets, &counter[dir].packets); atomic64_add(bytes, &counter[dir].bytes); } } EXPORT_SYMBOL_GPL(nf_ct_acct_add); static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct nf_conn *loser_ct) { struct nf_conn_acct *acct; acct = nf_conn_acct_find(loser_ct); if (acct) { struct nf_conn_counter *counter = acct->counter; unsigned int bytes; /* u32 should be fine since we must have seen one packet. */ bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes); nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes); } } static void __nf_conntrack_insert_prepare(struct nf_conn *ct) { struct nf_conn_tstamp *tstamp; refcount_inc(&ct->ct_general.use); /* set conntrack timestamp, if enabled. */ tstamp = nf_conn_tstamp_find(ct); if (tstamp) tstamp->start = ktime_get_real_ns(); } /** * nf_ct_match_reverse - check if ct1 and ct2 refer to identical flow * @ct1: conntrack in hash table to check against * @ct2: merge candidate * * returns true if ct1 and ct2 happen to refer to the same flow, but * in opposing directions, i.e. * ct1: a:b -> c:d * ct2: c:d -> a:b * for both directions. 
If so, @ct2 should not have been created * as the skb should have been picked up as ESTABLISHED flow. * But ct1 was not yet committed to hash table before skb that created * ct2 had arrived. * * Note we don't compare netns because ct entries in different net * namespace cannot clash to begin with. * * @return: true if ct1 and ct2 are identical when swapping origin/reply. */ static bool nf_ct_match_reverse(const struct nf_conn *ct1, const struct nf_conn *ct2) { u16 id1, id2; if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple, &ct2->tuplehash[IP_CT_DIR_REPLY].tuple)) return false; if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple, &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) return false; id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_ORIGINAL); id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_REPLY); if (id1 != id2) return false; id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_REPLY); id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL); return id1 == id2; } static int nf_ct_can_merge(const struct nf_conn *ct, const struct nf_conn *loser_ct) { return nf_ct_match(ct, loser_ct) || nf_ct_match_reverse(ct, loser_ct); } /* caller must hold locks to prevent concurrent changes */ static int __nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h) { /* This is the conntrack entry already in hashes that won race. */ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); enum ip_conntrack_info ctinfo; struct nf_conn *loser_ct; loser_ct = nf_ct_get(skb, &ctinfo); if (nf_ct_can_merge(ct, loser_ct)) { struct net *net = nf_ct_net(ct); nf_conntrack_get(&ct->ct_general); nf_ct_acct_merge(ct, ctinfo, loser_ct); nf_ct_put(loser_ct); nf_ct_set(skb, ct, ctinfo); NF_CT_STAT_INC(net, clash_resolve); return NF_ACCEPT; } return NF_DROP; } /** * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry * * @skb: skb that causes the collision * @repl_idx: hash slot for reply direction * * Called when origin or reply direction had a clash. * The skb can be handled without packet drop provided the reply direction * is unique or there the existing entry has the identical tuple in both * directions. * * Caller must hold conntrack table locks to prevent concurrent updates. * * Returns NF_DROP if the clash could not be handled. */ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx) { struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb); const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; struct net *net; zone = nf_ct_zone(loser_ct); net = nf_ct_net(loser_ct); /* Reply direction must never result in a clash, unless both origin * and reply tuples are identical. */ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) { if (nf_ct_key_equal(h, &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple, zone, net)) return __nf_ct_resolve_clash(skb, h); } /* We want the clashing entry to go away real soon: 1 second timeout. */ WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ); /* IPS_NAT_CLASH removes the entry automatically on the first * reply. Also prevents UDP tracker from moving the entry to * ASSURED state, i.e. the entry can always be evicted under * pressure. */ loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH; __nf_conntrack_insert_prepare(loser_ct); /* fake add for ORIGINAL dir: we want lookups to only find the entry * already in the table. This also hides the clashing entry from * ctnetlink iteration, i.e. conntrack -L won't show them. 
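 *
 * Net effect (sketch, for illustration): the loser entry only ever
 * matches reply-direction lookups, keeps IPS_FIXED_TIMEOUT so the one
 * second timeout set above cannot be extended, and quietly goes away
 * while original-direction traffic keeps resolving to the entry that
 * won the race.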
*/ hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &nf_conntrack_hash[repl_idx]); NF_CT_STAT_INC(net, clash_resolve); return NF_ACCEPT; } /** * nf_ct_resolve_clash - attempt to handle clash without packet drop * * @skb: skb that causes the clash * @h: tuplehash of the clashing entry already in table * @reply_hash: hash slot for reply direction * * A conntrack entry can be inserted to the connection tracking table * if there is no existing entry with an identical tuple. * * If there is one, @skb (and the associated, unconfirmed conntrack) has * to be dropped. In case @skb is retransmitted, next conntrack lookup * will find the already-existing entry. * * The major problem with such packet drop is the extra delay added by * the packet loss -- it will take some time for a retransmit to occur * (or the sender to time out when waiting for a reply). * * This function attempts to handle the situation without packet drop. * * If @skb has no NAT transformation or if the colliding entries are * exactly the same, only the to-be-confirmed conntrack entry is discarded * and @skb is associated with the conntrack entry already in the table. * * Failing that, the new, unconfirmed conntrack is still added to the table * provided that the collision only occurs in the ORIGINAL direction. * The new entry will be added only in the non-clashing REPLY direction, * so packets in the ORIGINAL direction will continue to match the existing * entry. The new entry will also have a fixed timeout so it expires -- * due to the collision, it will only see reply traffic. * * Returns NF_DROP if the clash could not be resolved. */ static __cold noinline int nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h, u32 reply_hash) { /* This is the conntrack entry already in hashes that won race. */ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); const struct nf_conntrack_l4proto *l4proto; enum ip_conntrack_info ctinfo; struct nf_conn *loser_ct; struct net *net; int ret; loser_ct = nf_ct_get(skb, &ctinfo); net = nf_ct_net(loser_ct); l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); if (!l4proto->allow_clash) goto drop; ret = __nf_ct_resolve_clash(skb, h); if (ret == NF_ACCEPT) return ret; ret = nf_ct_resolve_clash_harder(skb, reply_hash); if (ret == NF_ACCEPT) return ret; drop: NF_CT_STAT_INC(net, drop); NF_CT_STAT_INC(net, insert_failed); return NF_DROP; } /* Confirm a connection given skb; places it in hash table */ int __nf_conntrack_confirm(struct sk_buff *skb) { unsigned int chainlen = 0, sequence, max_chainlen; const struct nf_conntrack_zone *zone; unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; struct nf_conn_help *help; struct hlist_nulls_node *n; enum ip_conntrack_info ctinfo; struct net *net; int ret = NF_DROP; ct = nf_ct_get(skb, &ctinfo); net = nf_ct_net(ct); /* ipt_REJECT uses nf_conntrack_attach to attach related ICMP/TCP RST packets in other direction. Actual packet which created connection will be IP_CT_NEW or for an expected connection, IP_CT_RELATED. 
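 *
 * Rough call path, for orientation (a sketch, not part of the original
 * comment): nf_conntrack_in() allocates the unconfirmed entry for the
 * first packet of a flow at PRE_ROUTING/LOCAL_OUT; only if that packet
 * survives to the LOCAL_IN/POST_ROUTING confirm hook does this function
 * insert it into nf_conntrack_hash and set IPS_CONFIRMED.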
*/ if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) return NF_ACCEPT; zone = nf_ct_zone(ct); local_bh_disable(); do { sequence = read_seqcount_begin(&nf_conntrack_generation); /* reuse the hash saved before */ hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; hash = scale_hash(hash); reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); /* We're not in hash table, and we refuse to set up related * connections for unconfirmed conns. But packet copies and * REJECT will give spurious warnings here. */ /* Another skb with the same unconfirmed conntrack may * win the race. This may happen for bridge(br_flood) * or broadcast/multicast packets do skb_clone with * unconfirmed conntrack. */ if (unlikely(nf_ct_is_confirmed(ct))) { WARN_ON_ONCE(1); nf_conntrack_double_unlock(hash, reply_hash); local_bh_enable(); return NF_DROP; } if (!nf_ct_ext_valid_pre(ct->ext)) { NF_CT_STAT_INC(net, insert_failed); goto dying; } /* We have to check the DYING flag after unlink to prevent * a race against nf_ct_get_next_corpse() possibly called from * user context, else we insert an already 'dead' hash, blocking * further use of that particular connection -JM. */ ct->status |= IPS_CONFIRMED; if (unlikely(nf_ct_is_dying(ct))) { NF_CT_STAT_INC(net, insert_failed); goto dying; } max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN); /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. */ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, zone, net)) goto out; if (chainlen++ > max_chainlen) goto chaintoolong; } chainlen = 0; hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, zone, net)) goto out; if (chainlen++ > max_chainlen) { chaintoolong: NF_CT_STAT_INC(net, chaintoolong); NF_CT_STAT_INC(net, insert_failed); ret = NF_DROP; goto dying; } } /* Timer relative to confirmation time, not original setting time, otherwise we'd get timer wrap in weird delay cases. */ ct->timeout += nfct_time_stamp; __nf_conntrack_insert_prepare(ct); /* Since the lookup is lockless, hash insertion must be done after * starting the timer and setting the CONFIRMED bit. The RCU barriers * guarantee that no other CPU can find the conntrack before the above * stores are visible. */ __nf_conntrack_hash_insert(ct, hash, reply_hash); nf_conntrack_double_unlock(hash, reply_hash); local_bh_enable(); /* ext area is still valid (rcu read lock is held, * but will go out of scope soon, we need to remove * this conntrack again. */ if (!nf_ct_ext_valid_post(ct->ext)) { nf_ct_kill(ct); NF_CT_STAT_INC_ATOMIC(net, drop); return NF_DROP; } help = nfct_help(ct); if (help && help->helper) nf_conntrack_event_cache(IPCT_HELPER, ct); nf_conntrack_event_cache(master_ct(ct) ? IPCT_RELATED : IPCT_NEW, ct); return NF_ACCEPT; out: ret = nf_ct_resolve_clash(skb, h, reply_hash); dying: nf_conntrack_double_unlock(hash, reply_hash); local_bh_enable(); return ret; } EXPORT_SYMBOL_GPL(__nf_conntrack_confirm); /* Returns true if a connection corresponds to the tuple (required for NAT). 
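 *
 * Hypothetical caller, to illustrate the contract (not from the
 * original source): NAT port allocation proposes a reply tuple and
 * keeps adjusting the source port while
 * nf_conntrack_tuple_taken(&proposed_reply, ct) keeps returning
 * nonzero; a zero return means the proposed mapping is free to use.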
*/ int nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack) { struct net *net = nf_ct_net(ignored_conntrack); const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_head *ct_hash; unsigned int hash, hsize; struct hlist_nulls_node *n; struct nf_conn *ct; zone = nf_ct_zone(ignored_conntrack); rcu_read_lock(); begin: nf_conntrack_get_ht(&ct_hash, &hsize); hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize); hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); if (ct == ignored_conntrack) continue; if (nf_ct_is_expired(ct)) { nf_ct_gc_expired(ct); continue; } if (nf_ct_key_equal(h, tuple, zone, net)) { /* Tuple is taken already, so caller will need to find * a new source port to use. * * Only exception: * If the *original tuples* are identical, then both * conntracks refer to the same flow. * This is a rare situation, it can occur e.g. when * more than one UDP packet is sent from same socket * in different threads. * * Let nf_ct_resolve_clash() deal with this later. */ if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) && nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) continue; NF_CT_STAT_INC_ATOMIC(net, found); rcu_read_unlock(); return 1; } } if (get_nulls_value(n) != hash) { NF_CT_STAT_INC_ATOMIC(net, search_restart); goto begin; } rcu_read_unlock(); return 0; } EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken); #define NF_CT_EVICTION_RANGE 8 /* There's a small race here where we may free a just-assured connection. Too bad: we're in trouble anyway. */ static unsigned int early_drop_list(struct net *net, struct hlist_nulls_head *head) { struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; unsigned int drops = 0; struct nf_conn *tmp; hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) { tmp = nf_ct_tuplehash_to_ctrack(h); if (nf_ct_is_expired(tmp)) { nf_ct_gc_expired(tmp); continue; } if (test_bit(IPS_ASSURED_BIT, &tmp->status) || !net_eq(nf_ct_net(tmp), net) || nf_ct_is_dying(tmp)) continue; if (!refcount_inc_not_zero(&tmp->ct_general.use)) continue; /* load ->ct_net and ->status after refcount increase */ smp_acquire__after_ctrl_dep(); /* kill only if still in same netns -- might have moved due to * SLAB_TYPESAFE_BY_RCU rules. * * We steal the timer reference. If that fails timer has * already fired or someone else deleted it. Just drop ref * and move to next entry. 
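 *
 * This is the usual SLAB_TYPESAFE_BY_RCU pattern: between the RCU
 * lookup and refcount_inc_not_zero() the object may have been freed
 * and recycled for a completely different flow, so every field read
 * before taking the reference (netns, confirmed state) is validated
 * again afterwards before acting on the entry.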
*/ if (net_eq(nf_ct_net(tmp), net) && nf_ct_is_confirmed(tmp) && nf_ct_delete(tmp, 0, 0)) drops++; nf_ct_put(tmp); } return drops; } static noinline int early_drop(struct net *net, unsigned int hash) { unsigned int i, bucket; for (i = 0; i < NF_CT_EVICTION_RANGE; i++) { struct hlist_nulls_head *ct_hash; unsigned int hsize, drops; rcu_read_lock(); nf_conntrack_get_ht(&ct_hash, &hsize); if (!i) bucket = reciprocal_scale(hash, hsize); else bucket = (bucket + 1) % hsize; drops = early_drop_list(net, &ct_hash[bucket]); rcu_read_unlock(); if (drops) { NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops); return true; } } return false; } static bool gc_worker_skip_ct(const struct nf_conn *ct) { return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct); } static bool gc_worker_can_early_drop(const struct nf_conn *ct) { const struct nf_conntrack_l4proto *l4proto; u8 protonum = nf_ct_protonum(ct); if (!test_bit(IPS_ASSURED_BIT, &ct->status)) return true; l4proto = nf_ct_l4proto_find(protonum); if (l4proto->can_early_drop && l4proto->can_early_drop(ct)) return true; return false; } static void gc_worker(struct work_struct *work) { unsigned int i, hashsz, nf_conntrack_max95 = 0; u32 end_time, start_time = nfct_time_stamp; struct conntrack_gc_work *gc_work; unsigned int expired_count = 0; unsigned long next_run; s32 delta_time; long count; gc_work = container_of(work, struct conntrack_gc_work, dwork.work); i = gc_work->next_bucket; if (gc_work->early_drop) nf_conntrack_max95 = nf_conntrack_max / 100u * 95u; if (i == 0) { gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT; gc_work->count = GC_SCAN_INITIAL_COUNT; gc_work->start_time = start_time; } next_run = gc_work->avg_timeout; count = gc_work->count; end_time = start_time + GC_SCAN_MAX_DURATION; do { struct nf_conntrack_tuple_hash *h; struct hlist_nulls_head *ct_hash; struct hlist_nulls_node *n; struct nf_conn *tmp; rcu_read_lock(); nf_conntrack_get_ht(&ct_hash, &hashsz); if (i >= hashsz) { rcu_read_unlock(); break; } hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) { struct nf_conntrack_net *cnet; struct net *net; long expires; tmp = nf_ct_tuplehash_to_ctrack(h); if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) { nf_ct_offload_timeout(tmp); if (!nf_conntrack_max95) continue; } if (expired_count > GC_SCAN_EXPIRED_MAX) { rcu_read_unlock(); gc_work->next_bucket = i; gc_work->avg_timeout = next_run; gc_work->count = count; delta_time = nfct_time_stamp - gc_work->start_time; /* re-sched immediately if total cycle time is exceeded */ next_run = delta_time < (s32)GC_SCAN_INTERVAL_MAX; goto early_exit; } if (nf_ct_is_expired(tmp)) { nf_ct_gc_expired(tmp); expired_count++; continue; } expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP); expires = (expires - (long)next_run) / ++count; next_run += expires; if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp)) continue; net = nf_ct_net(tmp); cnet = nf_ct_pernet(net); if (atomic_read(&cnet->count) < nf_conntrack_max95) continue; /* need to take reference to avoid possible races */ if (!refcount_inc_not_zero(&tmp->ct_general.use)) continue; /* load ->status after refcount increase */ smp_acquire__after_ctrl_dep(); if (gc_worker_skip_ct(tmp)) { nf_ct_put(tmp); continue; } if (gc_worker_can_early_drop(tmp)) { nf_ct_kill(tmp); expired_count++; } nf_ct_put(tmp); } /* could check get_nulls_value() here and restart if ct * was moved to another chain. But given gc is best-effort * we will just continue with next hash slot. 
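 *
 * Scheduling note (sketch of the arithmetic above, for illustration):
 * next_run is kept as a running average of the clamped per-entry expiry
 * times, "avg += (sample - avg) / ++count", seeded with
 * GC_SCAN_INITIAL_COUNT fake samples at GC_SCAN_INTERVAL_INIT so that a
 * handful of short-lived entries cannot force very frequent wakeups.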
*/ rcu_read_unlock(); cond_resched(); i++; delta_time = nfct_time_stamp - end_time; if (delta_time > 0 && i < hashsz) { gc_work->avg_timeout = next_run; gc_work->count = count; gc_work->next_bucket = i; next_run = 0; goto early_exit; } } while (i < hashsz); gc_work->next_bucket = 0; next_run = clamp(next_run, GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_MAX); delta_time = max_t(s32, nfct_time_stamp - gc_work->start_time, 1); if (next_run > (unsigned long)delta_time) next_run -= delta_time; else next_run = 1; early_exit: if (gc_work->exiting) return; if (next_run) gc_work->early_drop = false; queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run); } static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work) { INIT_DELAYED_WORK(&gc_work->dwork, gc_worker); gc_work->exiting = false; } static struct nf_conn * __nf_conntrack_alloc(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl, gfp_t gfp, u32 hash) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); unsigned int ct_count; struct nf_conn *ct; /* We don't want any race condition at early drop stage */ ct_count = atomic_inc_return(&cnet->count); if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) { if (!early_drop(net, hash)) { if (!conntrack_gc_work.early_drop) conntrack_gc_work.early_drop = true; atomic_dec(&cnet->count); net_warn_ratelimited("nf_conntrack: table full, dropping packet\n"); return ERR_PTR(-ENOMEM); } } /* * Do not use kmem_cache_zalloc(), as this cache uses * SLAB_TYPESAFE_BY_RCU. */ ct = kmem_cache_alloc(nf_conntrack_cachep, gfp); if (ct == NULL) goto out; spin_lock_init(&ct->lock); ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL; ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; /* save hash for reusing when confirming */ *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash; ct->status = 0; WRITE_ONCE(ct->timeout, 0); write_pnet(&ct->ct_net, net); memset_after(ct, 0, __nfct_init_offset); nf_ct_zone_add(ct, zone); /* Because we use RCU lookups, we set ct_general.use to zero before * this is inserted in any list. */ refcount_set(&ct->ct_general.use, 0); return ct; out: atomic_dec(&cnet->count); return ERR_PTR(-ENOMEM); } struct nf_conn *nf_conntrack_alloc(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl, gfp_t gfp) { return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0); } EXPORT_SYMBOL_GPL(nf_conntrack_alloc); void nf_conntrack_free(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); struct nf_conntrack_net *cnet; /* A freed object has refcnt == 0, that's * the golden rule for SLAB_TYPESAFE_BY_RCU */ WARN_ON(refcount_read(&ct->ct_general.use) != 0); if (ct->status & IPS_SRC_NAT_DONE) { const struct nf_nat_hook *nat_hook; rcu_read_lock(); nat_hook = rcu_dereference(nf_nat_hook); if (nat_hook) nat_hook->remove_nat_bysrc(ct); rcu_read_unlock(); } kfree(ct->ext); kmem_cache_free(nf_conntrack_cachep, ct); cnet = nf_ct_pernet(net); smp_mb__before_atomic(); atomic_dec(&cnet->count); } EXPORT_SYMBOL_GPL(nf_conntrack_free); /* Allocate a new conntrack: we return -ENOMEM if classification failed due to stress. Otherwise it really is unclassifiable. 
*/ static noinline struct nf_conntrack_tuple_hash * init_conntrack(struct net *net, struct nf_conn *tmpl, const struct nf_conntrack_tuple *tuple, struct sk_buff *skb, unsigned int dataoff, u32 hash) { struct nf_conn *ct; struct nf_conn_help *help; struct nf_conntrack_tuple repl_tuple; #ifdef CONFIG_NF_CONNTRACK_EVENTS struct nf_conntrack_ecache *ecache; #endif struct nf_conntrack_expect *exp = NULL; const struct nf_conntrack_zone *zone; struct nf_conn_timeout *timeout_ext; struct nf_conntrack_zone tmp; struct nf_conntrack_net *cnet; if (!nf_ct_invert_tuple(&repl_tuple, tuple)) return NULL; zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC, hash); if (IS_ERR(ct)) return ERR_CAST(ct); if (!nf_ct_add_synproxy(ct, tmpl)) { nf_conntrack_free(ct); return ERR_PTR(-ENOMEM); } timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL; if (timeout_ext) nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout), GFP_ATOMIC); nf_ct_acct_ext_add(ct, GFP_ATOMIC); nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); nf_ct_labels_ext_add(ct); #ifdef CONFIG_NF_CONNTRACK_EVENTS ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL; if ((ecache || net->ct.sysctl_events) && !nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0, ecache ? ecache->expmask : 0, GFP_ATOMIC)) { nf_conntrack_free(ct); return ERR_PTR(-ENOMEM); } #endif cnet = nf_ct_pernet(net); if (cnet->expect_count) { spin_lock_bh(&nf_conntrack_expect_lock); exp = nf_ct_find_expectation(net, zone, tuple, !tmpl || nf_ct_is_confirmed(tmpl)); if (exp) { /* Welcome, Mr. Bond. We've been expecting you... */ __set_bit(IPS_EXPECTED_BIT, &ct->status); /* exp->master safe, refcnt bumped in nf_ct_find_expectation */ ct->master = exp->master; if (exp->helper) { help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); if (help) rcu_assign_pointer(help->helper, exp->helper); } #ifdef CONFIG_NF_CONNTRACK_MARK ct->mark = READ_ONCE(exp->master->mark); #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK ct->secmark = exp->master->secmark; #endif NF_CT_STAT_INC(net, expect_new); } spin_unlock_bh(&nf_conntrack_expect_lock); } if (!exp && tmpl) __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); /* Other CPU might have obtained a pointer to this object before it was * released. Because refcount is 0, refcount_inc_not_zero() will fail. * * After refcount_set(1) it will succeed; ensure that zeroing of * ct->status and the correct ct->net pointer are visible; else other * core might observe CONFIRMED bit which means the entry is valid and * in the hash table, but its not (anymore). */ smp_wmb(); /* Now it is going to be associated with an sk_buff, set refcount to 1. 
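 *
 * Publication sketch (illustration of the pairing described above):
 *
 *	this path			lookup side
 *	  initialise all ct fields	  refcount_inc_not_zero()
 *	  smp_wmb()			  smp_acquire__after_ctrl_dep()
 *	  refcount_set(&use, 1)		  re-check nf_ct_key_equal()
 *
 * so a reader that manages to take a reference is guaranteed to see the
 * fully initialised object.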
*/ refcount_set(&ct->ct_general.use, 1); if (exp) { if (exp->expectfn) exp->expectfn(ct, exp); nf_ct_expect_put(exp); } return &ct->tuplehash[IP_CT_DIR_ORIGINAL]; } /* On success, returns 0, sets skb->_nfct | ctinfo */ static int resolve_normal_ct(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, u_int8_t protonum, const struct nf_hook_state *state) { const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_hash *h; enum ip_conntrack_info ctinfo; struct nf_conntrack_zone tmp; u32 hash, zone_id, rid; struct nf_conn *ct; if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, state->pf, protonum, state->net, &tuple)) return 0; /* look for tuple match */ zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); hash = hash_conntrack_raw(&tuple, zone_id, state->net); h = __nf_conntrack_find_get(state->net, zone, &tuple, hash); if (!h) { rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); if (zone_id != rid) { u32 tmp = hash_conntrack_raw(&tuple, rid, state->net); h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp); } } if (!h) { h = init_conntrack(state->net, tmpl, &tuple, skb, dataoff, hash); if (!h) return 0; if (IS_ERR(h)) return PTR_ERR(h); } ct = nf_ct_tuplehash_to_ctrack(h); /* It exists; we have (non-exclusive) reference. */ if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) { ctinfo = IP_CT_ESTABLISHED_REPLY; } else { unsigned long status = READ_ONCE(ct->status); /* Once we've had two way comms, always ESTABLISHED. */ if (likely(status & IPS_SEEN_REPLY)) ctinfo = IP_CT_ESTABLISHED; else if (status & IPS_EXPECTED) ctinfo = IP_CT_RELATED; else ctinfo = IP_CT_NEW; } nf_ct_set(skb, ct, ctinfo); return 0; } /* * icmp packets need special treatment to handle error messages that are * related to a connection. * * Callers need to check if skb has a conntrack assigned when this * helper returns; in such case skb belongs to an already known connection. */ static unsigned int __cold nf_conntrack_handle_icmp(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, u8 protonum, const struct nf_hook_state *state) { int ret; if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP) ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state); #if IS_ENABLED(CONFIG_IPV6) else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6) ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state); #endif else return NF_ACCEPT; if (ret <= 0) NF_CT_STAT_INC_ATOMIC(state->net, error); return ret; } static int generic_packet(struct nf_conn *ct, struct sk_buff *skb, enum ip_conntrack_info ctinfo) { const unsigned int *timeout = nf_ct_timeout_lookup(ct); if (!timeout) timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout; nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; } /* Returns verdict for packet, or -1 for invalid. 
*/ static int nf_conntrack_handle_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { switch (nf_ct_protonum(ct)) { case IPPROTO_TCP: return nf_conntrack_tcp_packet(ct, skb, dataoff, ctinfo, state); case IPPROTO_UDP: return nf_conntrack_udp_packet(ct, skb, dataoff, ctinfo, state); case IPPROTO_ICMP: return nf_conntrack_icmp_packet(ct, skb, ctinfo, state); #if IS_ENABLED(CONFIG_IPV6) case IPPROTO_ICMPV6: return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state); #endif #ifdef CONFIG_NF_CT_PROTO_UDPLITE case IPPROTO_UDPLITE: return nf_conntrack_udplite_packet(ct, skb, dataoff, ctinfo, state); #endif #ifdef CONFIG_NF_CT_PROTO_SCTP case IPPROTO_SCTP: return nf_conntrack_sctp_packet(ct, skb, dataoff, ctinfo, state); #endif #ifdef CONFIG_NF_CT_PROTO_DCCP case IPPROTO_DCCP: return nf_conntrack_dccp_packet(ct, skb, dataoff, ctinfo, state); #endif #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: return nf_conntrack_gre_packet(ct, skb, dataoff, ctinfo, state); #endif } return generic_packet(ct, skb, ctinfo); } unsigned int nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state) { enum ip_conntrack_info ctinfo; struct nf_conn *ct, *tmpl; u_int8_t protonum; int dataoff, ret; tmpl = nf_ct_get(skb, &ctinfo); if (tmpl || ctinfo == IP_CT_UNTRACKED) { /* Previously seen (loopback or untracked)? Ignore. */ if ((tmpl && !nf_ct_is_template(tmpl)) || ctinfo == IP_CT_UNTRACKED) return NF_ACCEPT; skb->_nfct = 0; } /* rcu_read_lock()ed by nf_hook_thresh */ dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum); if (dataoff <= 0) { NF_CT_STAT_INC_ATOMIC(state->net, invalid); ret = NF_ACCEPT; goto out; } if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) { ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff, protonum, state); if (ret <= 0) { ret = -ret; goto out; } /* ICMP[v6] protocol trackers may assign one conntrack. */ if (skb->_nfct) goto out; } repeat: ret = resolve_normal_ct(tmpl, skb, dataoff, protonum, state); if (ret < 0) { /* Too stressed to deal. */ NF_CT_STAT_INC_ATOMIC(state->net, drop); ret = NF_DROP; goto out; } ct = nf_ct_get(skb, &ctinfo); if (!ct) { /* Not valid part of a connection */ NF_CT_STAT_INC_ATOMIC(state->net, invalid); ret = NF_ACCEPT; goto out; } ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state); if (ret <= 0) { /* Invalid: inverse of the return code tells * the netfilter core what to do */ nf_ct_put(ct); skb->_nfct = 0; /* Special case: TCP tracker reports an attempt to reopen a * closed/aborted connection. We have to go back and create a * fresh conntrack. 
*/ if (ret == -NF_REPEAT) goto repeat; NF_CT_STAT_INC_ATOMIC(state->net, invalid); if (ret == NF_DROP) NF_CT_STAT_INC_ATOMIC(state->net, drop); ret = -ret; goto out; } if (ctinfo == IP_CT_ESTABLISHED_REPLY && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_REPLY, ct); out: if (tmpl) nf_ct_put(tmpl); return ret; } EXPORT_SYMBOL_GPL(nf_conntrack_in); /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ void __nf_ct_refresh_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct sk_buff *skb, u32 extra_jiffies, bool do_acct) { /* Only update if this is not a fixed timeout */ if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) goto acct; /* If not in hash table, timer will not be active yet */ if (nf_ct_is_confirmed(ct)) extra_jiffies += nfct_time_stamp; if (READ_ONCE(ct->timeout) != extra_jiffies) WRITE_ONCE(ct->timeout, extra_jiffies); acct: if (do_acct) nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); } EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); bool nf_ct_kill_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct sk_buff *skb) { nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); return nf_ct_delete(ct, 0, 0); } EXPORT_SYMBOL_GPL(nf_ct_kill_acct); #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> #include <linux/mutex.h> /* Generic function for tcp/udp/sctp/dccp and alike. */ int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) || nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port)) goto nla_put_failure; return 0; nla_put_failure: return -1; } EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr); const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = { [CTA_PROTO_SRC_PORT] = { .type = NLA_U16 }, [CTA_PROTO_DST_PORT] = { .type = NLA_U16 }, }; EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy); int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *t, u_int32_t flags) { if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) { if (!tb[CTA_PROTO_SRC_PORT]) return -EINVAL; t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); } if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) { if (!tb[CTA_PROTO_DST_PORT]) return -EINVAL; t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); } return 0; } EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple); unsigned int nf_ct_port_nlattr_tuple_size(void) { static unsigned int size __read_mostly; if (!size) size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); return size; } EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size); #endif /* Used by ipt_REJECT and ip6t_REJECT. */ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; /* This ICMP is in reverse direction to the packet which caused it */ ct = nf_ct_get(skb, &ctinfo); if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) ctinfo = IP_CT_RELATED_REPLY; else ctinfo = IP_CT_RELATED; /* Attach to new skbuff, and increment count */ nf_ct_set(nskb, ct, ctinfo); nf_conntrack_get(skb_nfct(nskb)); } /* This packet is coming from userspace via nf_queue, complete the packet * processing after the helper invocation in nf_confirm(). 
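 *
 * Assumed flow, for orientation: a packet that was queued to a
 * userspace helper via nfnetlink_queue re-enters the stack once the
 * verdict arrives, and the core calls back through the
 * nf_ct_hook->update pointer (wired to nf_conntrack_update() below) so
 * that sequence adjustment and the final confirm still happen.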
*/ static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { const struct nf_conntrack_helper *helper; const struct nf_conn_help *help; int protoff; help = nfct_help(ct); if (!help) return NF_ACCEPT; helper = rcu_dereference(help->helper); if (!helper) return NF_ACCEPT; if (!(helper->flags & NF_CT_HELPER_F_USERSPACE)) return NF_ACCEPT; switch (nf_ct_l3num(ct)) { case NFPROTO_IPV4: protoff = skb_network_offset(skb) + ip_hdrlen(skb); break; #if IS_ENABLED(CONFIG_IPV6) case NFPROTO_IPV6: { __be16 frag_off; u8 pnum; pnum = ipv6_hdr(skb)->nexthdr; protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, &frag_off); if (protoff < 0 || (frag_off & htons(~0x7)) != 0) return NF_ACCEPT; break; } #endif default: return NF_ACCEPT; } if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && !nf_is_loopback_packet(skb)) { if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); return NF_DROP; } } /* We've seen it coming out the other side: confirm it */ return nf_conntrack_confirm(skb); } static int nf_conntrack_update(struct net *net, struct sk_buff *skb) { enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); if (!ct) return NF_ACCEPT; return nf_confirm_cthelper(skb, ct, ctinfo); } static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb) { const struct nf_conntrack_tuple *src_tuple; const struct nf_conntrack_tuple_hash *hash; struct nf_conntrack_tuple srctuple; enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); if (ct) { src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo)); memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); return true; } if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), NFPROTO_IPV4, dev_net(skb->dev), &srctuple)) return false; hash = nf_conntrack_find_get(dev_net(skb->dev), &nf_ct_zone_dflt, &srctuple); if (!hash) return false; ct = nf_ct_tuplehash_to_ctrack(hash); src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir); memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); nf_ct_put(ct); return true; } /* Bring out ya dead! */ static struct nf_conn * get_next_corpse(int (*iter)(struct nf_conn *i, void *data), const struct nf_ct_iter_data *iter_data, unsigned int *bucket) { struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; struct hlist_nulls_node *n; spinlock_t *lockp; for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket]; if (hlist_nulls_empty(hslot)) continue; lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; local_bh_disable(); nf_conntrack_lock(lockp); hlist_nulls_for_each_entry(h, n, hslot, hnnode) { if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY) continue; /* All nf_conn objects are added to hash table twice, one * for original direction tuple, once for the reply tuple. * * Exception: In the IPS_NAT_CLASH case, only the reply * tuple is added (the original tuple already existed for * a different object). * * We only need to call the iterator once for each * conntrack, so we just use the 'reply' direction * tuple while iterating. 
*/ ct = nf_ct_tuplehash_to_ctrack(h); if (iter_data->net && !net_eq(iter_data->net, nf_ct_net(ct))) continue; if (iter(ct, iter_data->data)) goto found; } spin_unlock(lockp); local_bh_enable(); cond_resched(); } return NULL; found: refcount_inc(&ct->ct_general.use); spin_unlock(lockp); local_bh_enable(); return ct; } static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), const struct nf_ct_iter_data *iter_data) { unsigned int bucket = 0; struct nf_conn *ct; might_sleep(); mutex_lock(&nf_conntrack_mutex); while ((ct = get_next_corpse(iter, iter_data, &bucket)) != NULL) { /* Time to push up daisies... */ nf_ct_delete(ct, iter_data->portid, iter_data->report); nf_ct_put(ct); cond_resched(); } mutex_unlock(&nf_conntrack_mutex); } void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data), const struct nf_ct_iter_data *iter_data) { struct net *net = iter_data->net; struct nf_conntrack_net *cnet = nf_ct_pernet(net); might_sleep(); if (atomic_read(&cnet->count) == 0) return; nf_ct_iterate_cleanup(iter, iter_data); } EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net); /** * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table * @iter: callback to invoke for each conntrack * @data: data to pass to @iter * * Like nf_ct_iterate_cleanup, but first marks conntracks on the * unconfirmed list as dying (so they will not be inserted into * main table). * * Can only be called in module exit path. */ void nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) { struct nf_ct_iter_data iter_data = {}; struct net *net; down_read(&net_rwsem); for_each_net(net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); if (atomic_read(&cnet->count) == 0) continue; nf_queue_nf_hook_drop(net); } up_read(&net_rwsem); /* Need to wait for netns cleanup worker to finish, if it's * running -- it might have deleted a net namespace from * the global list, so hook drop above might not have * affected all namespaces. */ net_ns_barrier(); /* an skb with an unconfirmed conntrack could have been reinjected just * before we called nf_queue_nf_hook_drop(). * * This makes sure it's inserted into the conntrack table. */ synchronize_net(); nf_ct_ext_bump_genid(); iter_data.data = data; nf_ct_iterate_cleanup(iter, &iter_data); /* Another CPU might be in an RCU read section with * an RCU-protected pointer cleared in iter callback * or hidden via nf_ct_ext_bump_genid() above. * * Wait until those are done. */ synchronize_rcu(); } EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy); static int kill_all(struct nf_conn *i, void *data) { return 1; } void nf_conntrack_cleanup_start(void) { cleanup_nf_conntrack_bpf(); conntrack_gc_work.exiting = true; } void nf_conntrack_cleanup_end(void) { RCU_INIT_POINTER(nf_ct_hook, NULL); cancel_delayed_work_sync(&conntrack_gc_work.dwork); kvfree(nf_conntrack_hash); nf_conntrack_proto_fini(); nf_conntrack_helper_fini(); nf_conntrack_expect_fini(); kmem_cache_destroy(nf_conntrack_cachep); } /* * Mishearing the voices in his head, our hero wonders how he's * supposed to kill the mall. */ void nf_conntrack_cleanup_net(struct net *net) { LIST_HEAD(single); list_add(&net->exit_list, &single); nf_conntrack_cleanup_net_list(&single); } void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) { struct nf_ct_iter_data iter_data = {}; struct net *net; int busy; /* * This makes sure all current packets have passed through * netfilter framework. Roll on, two-stage module * delete...
*/ synchronize_rcu_expedited(); i_see_dead_people: busy = 0; list_for_each_entry(net, net_exit_list, exit_list) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); iter_data.net = net; nf_ct_iterate_cleanup_net(kill_all, &iter_data); if (atomic_read(&cnet->count) != 0) busy = 1; } if (busy) { schedule(); goto i_see_dead_people; } list_for_each_entry(net, net_exit_list, exit_list) { nf_conntrack_ecache_pernet_fini(net); nf_conntrack_expect_pernet_fini(net); free_percpu(net->ct.stat); } } void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) { struct hlist_nulls_head *hash; unsigned int nr_slots, i; if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head))) return NULL; BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL); if (hash && nulls) for (i = 0; i < nr_slots; i++) INIT_HLIST_NULLS_HEAD(&hash[i], i); return hash; } EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable); int nf_conntrack_hash_resize(unsigned int hashsize) { int i, bucket; unsigned int old_size; struct hlist_nulls_head *hash, *old_hash; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; if (!hashsize) return -EINVAL; hash = nf_ct_alloc_hashtable(&hashsize, 1); if (!hash) return -ENOMEM; mutex_lock(&nf_conntrack_mutex); old_size = nf_conntrack_htable_size; if (old_size == hashsize) { mutex_unlock(&nf_conntrack_mutex); kvfree(hash); return 0; } local_bh_disable(); nf_conntrack_all_lock(); write_seqcount_begin(&nf_conntrack_generation); /* Lookups in the old hash might happen in parallel, which means we * might get false negatives during connection lookup. New connections * created because of a false negative won't make it into the hash * though since that required taking the locks. */ for (i = 0; i < nf_conntrack_htable_size; i++) { while (!hlist_nulls_empty(&nf_conntrack_hash[i])) { unsigned int zone_id; h = hlist_nulls_entry(nf_conntrack_hash[i].first, struct nf_conntrack_tuple_hash, hnnode); ct = nf_ct_tuplehash_to_ctrack(h); hlist_nulls_del_rcu(&h->hnnode); zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h)); bucket = __hash_conntrack(nf_ct_net(ct), &h->tuple, zone_id, hashsize); hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); } } old_hash = nf_conntrack_hash; nf_conntrack_hash = hash; nf_conntrack_htable_size = hashsize; write_seqcount_end(&nf_conntrack_generation); nf_conntrack_all_unlock(); local_bh_enable(); mutex_unlock(&nf_conntrack_mutex); synchronize_net(); kvfree(old_hash); return 0; } int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp) { unsigned int hashsize; int rc; if (current->nsproxy->net_ns != &init_net) return -EOPNOTSUPP; /* On boot, we can set this without any fancy locking. 
*/ if (!nf_conntrack_hash) return param_set_uint(val, kp); rc = kstrtouint(val, 0, &hashsize); if (rc) return rc; return nf_conntrack_hash_resize(hashsize); } int nf_conntrack_init_start(void) { unsigned long nr_pages = totalram_pages(); int max_factor = 8; int ret = -ENOMEM; int i; seqcount_spinlock_init(&nf_conntrack_generation, &nf_conntrack_locks_all_lock); for (i = 0; i < CONNTRACK_LOCKS; i++) spin_lock_init(&nf_conntrack_locks[i]); if (!nf_conntrack_htable_size) { nf_conntrack_htable_size = (((nr_pages << PAGE_SHIFT) / 16384) / sizeof(struct hlist_head)); if (BITS_PER_LONG >= 64 && nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) nf_conntrack_htable_size = 262144; else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) nf_conntrack_htable_size = 65536; if (nf_conntrack_htable_size < 1024) nf_conntrack_htable_size = 1024; /* Use a max. factor of one by default to keep the average * hash chain length at 2 entries. Each entry has to be added * twice (once for original direction, once for reply). * When a table size is given we use the old value of 8 to * avoid implicit reduction of the max entries setting. */ max_factor = 1; } nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1); if (!nf_conntrack_hash) return -ENOMEM; nf_conntrack_max = max_factor * nf_conntrack_htable_size; nf_conntrack_cachep = kmem_cache_create("nf_conntrack", sizeof(struct nf_conn), NFCT_INFOMASK + 1, SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); if (!nf_conntrack_cachep) goto err_cachep; ret = nf_conntrack_expect_init(); if (ret < 0) goto err_expect; ret = nf_conntrack_helper_init(); if (ret < 0) goto err_helper; ret = nf_conntrack_proto_init(); if (ret < 0) goto err_proto; conntrack_gc_work_init(&conntrack_gc_work); queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ); ret = register_nf_conntrack_bpf(); if (ret < 0) goto err_kfunc; return 0; err_kfunc: cancel_delayed_work_sync(&conntrack_gc_work.dwork); nf_conntrack_proto_fini(); err_proto: nf_conntrack_helper_fini(); err_helper: nf_conntrack_expect_fini(); err_expect: kmem_cache_destroy(nf_conntrack_cachep); err_cachep: kvfree(nf_conntrack_hash); return ret; } static void nf_conntrack_set_closing(struct nf_conntrack *nfct) { struct nf_conn *ct = nf_ct_to_nf_conn(nfct); switch (nf_ct_protonum(ct)) { case IPPROTO_TCP: nf_conntrack_tcp_set_closing(ct); break; } } static const struct nf_ct_hook nf_conntrack_hook = { .update = nf_conntrack_update, .destroy = nf_ct_destroy, .get_tuple_skb = nf_conntrack_get_tuple_skb, .attach = nf_conntrack_attach, .set_closing = nf_conntrack_set_closing, .confirm = __nf_conntrack_confirm, }; void nf_conntrack_init_end(void) { RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook); } /* * We need to use special "null" values, not used in hash table */ #define UNCONFIRMED_NULLS_VAL ((1<<30)+0) int nf_conntrack_init_net(struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); int ret = -ENOMEM; BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER); BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS); atomic_set(&cnet->count, 0); net->ct.stat = alloc_percpu(struct ip_conntrack_stat); if (!net->ct.stat) return ret; ret = nf_conntrack_expect_pernet_init(net); if (ret < 0) goto err_expect; nf_conntrack_acct_pernet_init(net); nf_conntrack_tstamp_pernet_init(net); nf_conntrack_ecache_pernet_init(net); nf_conntrack_proto_pernet_init(net); return 0; err_expect: free_percpu(net->ct.stat); return ret; } /* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */ int __nf_ct_change_timeout(struct 
nf_conn *ct, u64 timeout) { if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) return -EPERM; __nf_ct_set_timeout(ct, timeout); if (test_bit(IPS_DYING_BIT, &ct->status)) return -ETIME; return 0; } EXPORT_SYMBOL_GPL(__nf_ct_change_timeout); void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off) { unsigned int bit; /* Ignore these unchangeable bits */ on &= ~IPS_UNCHANGEABLE_MASK; off &= ~IPS_UNCHANGEABLE_MASK; for (bit = 0; bit < __IPS_MAX_BIT; bit++) { if (on & (1 << bit)) set_bit(bit, &ct->status); else if (off & (1 << bit)) clear_bit(bit, &ct->status); } } EXPORT_SYMBOL_GPL(__nf_ct_change_status); int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status) { unsigned long d; d = ct->status ^ status; if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING)) /* unchangeable */ return -EBUSY; if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY)) /* SEEN_REPLY bit can only be set */ return -EBUSY; if (d & IPS_ASSURED && !(status & IPS_ASSURED)) /* ASSURED bit can only be set */ return -EBUSY; __nf_ct_change_status(ct, status, 0); return 0; } EXPORT_SYMBOL_GPL(nf_ct_change_status_common);
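The boot-time sizing heuristic in nf_conntrack_init_start() above allocates roughly one hash bucket per 16384 bytes of RAM, clamps the result to the range [1024, 262144] on a 64-bit build, and then derives nf_conntrack_max from the bucket count via max_factor (1 when the size is auto-computed, 8 when a size is given explicitly). The stand-alone sketch below just redoes that arithmetic in user space for a hypothetical 8 GiB machine; the helper name and the use of sizeof(void *) as a stand-in for sizeof(struct hlist_head) are illustrative assumptions, not kernel code.

/*
 * Stand-alone sketch (not kernel code) of the sizing heuristic in
 * nf_conntrack_init_start(): one bucket per 16384 bytes of RAM, capped at
 * 262144 buckets above 4 GiB (assuming a 64-bit build) and at 65536 above
 * 1 GiB, with a floor of 1024.  sizeof(void *) stands in for
 * sizeof(struct hlist_head), which is a single pointer.
 */
#include <stdio.h>

static unsigned int default_htable_size(unsigned long long ram_bytes,
                                        unsigned long page_size)
{
        unsigned long long nr_pages = ram_bytes / page_size;
        unsigned long long size = ram_bytes / 16384 / sizeof(void *);

        if (nr_pages > 4ULL * (1024 * 1024 * 1024) / page_size)
                size = 262144;
        else if (nr_pages > (1024 * 1024 * 1024) / page_size)
                size = 65536;
        if (size < 1024)
                size = 1024;
        return (unsigned int)size;
}

int main(void)
{
        unsigned long long ram = 8ULL << 30;    /* hypothetical 8 GiB box */
        unsigned int buckets = default_htable_size(ram, 4096);

        /* max_factor is 1 for an auto-computed size, so max == buckets */
        printf("buckets=%u nf_conntrack_max=%u\n", buckets, buckets);
        return 0;
}

With these numbers the 8 GiB case hits the 262144-bucket cap, so the auto-computed nf_conntrack_max is also 262144; only an explicitly configured hashsize keeps the historical max_factor of 8.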
// SPDX-License-Identifier: GPL-2.0-only /* * mac80211 configuration hooks for cfg80211 * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH * Copyright (C) 2018-2024 Intel Corporation */ #include <linux/ieee80211.h> #include <linux/nl80211.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <net/net_namespace.h> #include <linux/rcupdate.h> #include <linux/fips.h> #include <linux/if_ether.h> #include <net/cfg80211.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" #include "mesh.h" #include "wme.h" static struct ieee80211_link_data * ieee80211_link_or_deflink(struct ieee80211_sub_if_data *sdata, int link_id, bool require_valid) { struct ieee80211_link_data *link; if (link_id < 0) { /* * For keys, if sdata is not an MLD, we might not use * the return value at all (if it's not a pairwise key), * so in that case (require_valid==false) don't error. */ if (require_valid && ieee80211_vif_is_mld(&sdata->vif)) return ERR_PTR(-EINVAL); return &sdata->deflink; } link = sdata_dereference(sdata->link[link_id], sdata); if (!link) return ERR_PTR(-ENOLINK); return link; } static void ieee80211_set_mu_mimo_follow(struct ieee80211_sub_if_data *sdata, struct vif_params *params) { bool mu_mimo_groups = false; bool mu_mimo_follow = false; if (params->vht_mumimo_groups) { u64 membership; BUILD_BUG_ON(sizeof(membership) != WLAN_MEMBERSHIP_LEN); memcpy(sdata->vif.bss_conf.mu_group.membership, params->vht_mumimo_groups, WLAN_MEMBERSHIP_LEN); memcpy(sdata->vif.bss_conf.mu_group.position, params->vht_mumimo_groups + WLAN_MEMBERSHIP_LEN, WLAN_USER_POSITION_LEN); ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_MU_GROUPS); /* don't care about endianness - just check for 0 */ memcpy(&membership, params->vht_mumimo_groups, WLAN_MEMBERSHIP_LEN); mu_mimo_groups = membership != 0; } if (params->vht_mumimo_follow_addr) { mu_mimo_follow = is_valid_ether_addr(params->vht_mumimo_follow_addr); ether_addr_copy(sdata->u.mntr.mu_follow_addr, params->vht_mumimo_follow_addr); } sdata->vif.bss_conf.mu_mimo_owner = mu_mimo_groups || mu_mimo_follow; } static int ieee80211_set_mon_options(struct ieee80211_sub_if_data *sdata, struct vif_params *params) { struct ieee80211_local *local = sdata->local; struct ieee80211_sub_if_data *monitor_sdata; /* check flags first */ if (params->flags && ieee80211_sdata_running(sdata)) { u32 mask = MONITOR_FLAG_COOK_FRAMES | MONITOR_FLAG_ACTIVE; /* * Prohibit MONITOR_FLAG_COOK_FRAMES and * MONITOR_FLAG_ACTIVE to be changed while the * interface is up.
* Else we would need to add a lot of cruft * to update everything: * cooked_mntrs, monitor and all fif_* counters * reconfigure hardware */ if ((params->flags & mask) != (sdata->u.mntr.flags & mask)) return -EBUSY; } /* also validate MU-MIMO change */ if (ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) monitor_sdata = sdata; else monitor_sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); if (!monitor_sdata && (params->vht_mumimo_groups || params->vht_mumimo_follow_addr)) return -EOPNOTSUPP; /* apply all changes now - no failures allowed */ if (monitor_sdata && (ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) || ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR))) ieee80211_set_mu_mimo_follow(monitor_sdata, params); if (params->flags) { if (ieee80211_sdata_running(sdata)) { ieee80211_adjust_monitor_flags(sdata, -1); sdata->u.mntr.flags = params->flags; ieee80211_adjust_monitor_flags(sdata, 1); ieee80211_configure_filter(local); } else { /* * Because the interface is down, ieee80211_do_stop * and ieee80211_do_open take care of "everything" * mentioned in the comment above. */ sdata->u.mntr.flags = params->flags; } } return 0; } static int ieee80211_set_ap_mbssid_options(struct ieee80211_sub_if_data *sdata, struct cfg80211_mbssid_config *params, struct ieee80211_bss_conf *link_conf) { struct ieee80211_sub_if_data *tx_sdata; sdata->vif.mbssid_tx_vif = NULL; link_conf->bssid_index = 0; link_conf->nontransmitted = false; link_conf->ema_ap = false; link_conf->bssid_indicator = 0; if (sdata->vif.type != NL80211_IFTYPE_AP || !params->tx_wdev) return -EINVAL; tx_sdata = IEEE80211_WDEV_TO_SUB_IF(params->tx_wdev); if (!tx_sdata) return -EINVAL; if (tx_sdata == sdata) { sdata->vif.mbssid_tx_vif = &sdata->vif; } else { sdata->vif.mbssid_tx_vif = &tx_sdata->vif; link_conf->nontransmitted = true; link_conf->bssid_index = params->index; } if (params->ema) link_conf->ema_ap = true; return 0; } static struct wireless_dev *ieee80211_add_iface(struct wiphy *wiphy, const char *name, unsigned char name_assign_type, enum nl80211_iftype type, struct vif_params *params) { struct ieee80211_local *local = wiphy_priv(wiphy); struct wireless_dev *wdev; struct ieee80211_sub_if_data *sdata; int err; err = ieee80211_if_add(local, name, name_assign_type, &wdev, type, params); if (err) return ERR_PTR(err); sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); if (type == NL80211_IFTYPE_MONITOR) { err = ieee80211_set_mon_options(sdata, params); if (err) { ieee80211_if_remove(sdata); return NULL; } } /* Let the driver know that an interface is going to be added. * Indicate so only for interface types that will be added to the * driver. 
*/ switch (type) { case NL80211_IFTYPE_AP_VLAN: break; case NL80211_IFTYPE_MONITOR: if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) || !(params->flags & MONITOR_FLAG_ACTIVE)) break; fallthrough; default: drv_prep_add_interface(local, ieee80211_vif_type_p2p(&sdata->vif)); break; } return wdev; } static int ieee80211_del_iface(struct wiphy *wiphy, struct wireless_dev *wdev) { ieee80211_if_remove(IEEE80211_WDEV_TO_SUB_IF(wdev)); return 0; } static int ieee80211_change_iface(struct wiphy *wiphy, struct net_device *dev, enum nl80211_iftype type, struct vif_params *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct sta_info *sta; int ret; lockdep_assert_wiphy(local->hw.wiphy); ret = ieee80211_if_change_type(sdata, type); if (ret) return ret; if (type == NL80211_IFTYPE_AP_VLAN && params->use_4addr == 0) { RCU_INIT_POINTER(sdata->u.vlan.sta, NULL); ieee80211_check_fast_rx_iface(sdata); } else if (type == NL80211_IFTYPE_STATION && params->use_4addr >= 0) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; if (params->use_4addr == ifmgd->use_4addr) return 0; /* FIXME: no support for 4-addr MLO yet */ if (ieee80211_vif_is_mld(&sdata->vif)) return -EOPNOTSUPP; sdata->u.mgd.use_4addr = params->use_4addr; if (!ifmgd->associated) return 0; sta = sta_info_get(sdata, sdata->deflink.u.mgd.bssid); if (sta) drv_sta_set_4addr(local, sdata, &sta->sta, params->use_4addr); if (params->use_4addr) ieee80211_send_4addr_nullfunc(local, sdata); } if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { ret = ieee80211_set_mon_options(sdata, params); if (ret) return ret; } return 0; } static int ieee80211_start_p2p_device(struct wiphy *wiphy, struct wireless_dev *wdev) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); int ret; lockdep_assert_wiphy(sdata->local->hw.wiphy); ret = ieee80211_check_combinations(sdata, NULL, 0, 0, -1); if (ret < 0) return ret; return ieee80211_do_open(wdev, true); } static void ieee80211_stop_p2p_device(struct wiphy *wiphy, struct wireless_dev *wdev) { ieee80211_sdata_stop(IEEE80211_WDEV_TO_SUB_IF(wdev)); } static int ieee80211_start_nan(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_nan_conf *conf) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); int ret; lockdep_assert_wiphy(sdata->local->hw.wiphy); ret = ieee80211_check_combinations(sdata, NULL, 0, 0, -1); if (ret < 0) return ret; ret = ieee80211_do_open(wdev, true); if (ret) return ret; ret = drv_start_nan(sdata->local, sdata, conf); if (ret) ieee80211_sdata_stop(sdata); sdata->u.nan.conf = *conf; return ret; } static void ieee80211_stop_nan(struct wiphy *wiphy, struct wireless_dev *wdev) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); drv_stop_nan(sdata->local, sdata); ieee80211_sdata_stop(sdata); } static int ieee80211_nan_change_conf(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_nan_conf *conf, u32 changes) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct cfg80211_nan_conf new_conf; int ret = 0; if (sdata->vif.type != NL80211_IFTYPE_NAN) return -EOPNOTSUPP; if (!ieee80211_sdata_running(sdata)) return -ENETDOWN; new_conf = sdata->u.nan.conf; if (changes & CFG80211_NAN_CONF_CHANGED_PREF) new_conf.master_pref = conf->master_pref; if (changes & CFG80211_NAN_CONF_CHANGED_BANDS) new_conf.bands = conf->bands; ret = drv_nan_change_conf(sdata->local, sdata, &new_conf, changes); if (!ret) sdata->u.nan.conf = new_conf; return ret; } 
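ieee80211_nan_change_conf() above illustrates a copy-modify-commit pattern that recurs throughout this file: copy the currently stored configuration, overwrite only the fields named in the changes bitmask, hand the candidate to the driver, and store it back into sdata->u.nan.conf only if the driver accepted it. The fragment below is a minimal, self-contained sketch of that pattern; all types and names are invented for illustration, with apply_to_driver() standing in for drv_nan_change_conf(). It is not mac80211 code.

/*
 * Minimal sketch of the copy-modify-commit update pattern used by
 * ieee80211_nan_change_conf().  All names here are invented for
 * illustration; apply_to_driver() stands in for drv_nan_change_conf().
 */
#include <stdio.h>

#define CONF_CHANGED_PREF   (1U << 0)
#define CONF_CHANGED_BANDS  (1U << 1)

struct nan_conf {
        unsigned int master_pref;
        unsigned int bands;
};

static int apply_to_driver(const struct nan_conf *conf, unsigned int changes)
{
        (void)conf;
        (void)changes;
        return 0;               /* pretend the driver accepted the change */
}

static int change_conf(struct nan_conf *cur, const struct nan_conf *req,
                       unsigned int changes)
{
        struct nan_conf new_conf = *cur;        /* start from current state */
        int ret;

        if (changes & CONF_CHANGED_PREF)
                new_conf.master_pref = req->master_pref;
        if (changes & CONF_CHANGED_BANDS)
                new_conf.bands = req->bands;

        ret = apply_to_driver(&new_conf, changes);
        if (!ret)
                *cur = new_conf;                /* commit only on success */
        return ret;
}

int main(void)
{
        struct nan_conf cur = { .master_pref = 10, .bands = 0x1 };
        struct nan_conf req = { .master_pref = 200, .bands = 0x3 };

        change_conf(&cur, &req, CONF_CHANGED_PREF);
        printf("master_pref=%u bands=%#x\n", cur.master_pref, cur.bands);
        return 0;
}

Only master_pref changes in this run because CONF_CHANGED_BANDS is not set, mirroring how the kernel function leaves unflagged fields at their previously stored values.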
static int ieee80211_add_nan_func(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_nan_func *nan_func) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); int ret; if (sdata->vif.type != NL80211_IFTYPE_NAN) return -EOPNOTSUPP; if (!ieee80211_sdata_running(sdata)) return -ENETDOWN; spin_lock_bh(&sdata->u.nan.func_lock); ret = idr_alloc(&sdata->u.nan.function_inst_ids, nan_func, 1, sdata->local->hw.max_nan_de_entries + 1, GFP_ATOMIC); spin_unlock_bh(&sdata->u.nan.func_lock); if (ret < 0) return ret; nan_func->instance_id = ret; WARN_ON(nan_func->instance_id == 0); ret = drv_add_nan_func(sdata->local, sdata, nan_func); if (ret) { spin_lock_bh(&sdata->u.nan.func_lock); idr_remove(&sdata->u.nan.function_inst_ids, nan_func->instance_id); spin_unlock_bh(&sdata->u.nan.func_lock); } return ret; } static struct cfg80211_nan_func * ieee80211_find_nan_func_by_cookie(struct ieee80211_sub_if_data *sdata, u64 cookie) { struct cfg80211_nan_func *func; int id; lockdep_assert_held(&sdata->u.nan.func_lock); idr_for_each_entry(&sdata->u.nan.function_inst_ids, func, id) { if (func->cookie == cookie) return func; } return NULL; } static void ieee80211_del_nan_func(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct cfg80211_nan_func *func; u8 instance_id = 0; if (sdata->vif.type != NL80211_IFTYPE_NAN || !ieee80211_sdata_running(sdata)) return; spin_lock_bh(&sdata->u.nan.func_lock); func = ieee80211_find_nan_func_by_cookie(sdata, cookie); if (func) instance_id = func->instance_id; spin_unlock_bh(&sdata->u.nan.func_lock); if (instance_id) drv_del_nan_func(sdata->local, sdata, instance_id); } static int ieee80211_set_noack_map(struct wiphy *wiphy, struct net_device *dev, u16 noack_map) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); sdata->noack_map = noack_map; ieee80211_check_fast_xmit_iface(sdata); return 0; } static int ieee80211_set_tx(struct ieee80211_sub_if_data *sdata, const u8 *mac_addr, u8 key_idx) { struct ieee80211_local *local = sdata->local; struct ieee80211_key *key; struct sta_info *sta; int ret = -EINVAL; if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_EXT_KEY_ID)) return -EINVAL; sta = sta_info_get_bss(sdata, mac_addr); if (!sta) return -EINVAL; if (sta->ptk_idx == key_idx) return 0; key = wiphy_dereference(local->hw.wiphy, sta->ptk[key_idx]); if (key && key->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX) ret = ieee80211_set_tx_key(key); return ret; } static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, int link_id, u8 key_idx, bool pairwise, const u8 *mac_addr, struct key_params *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link = ieee80211_link_or_deflink(sdata, link_id, false); struct ieee80211_local *local = sdata->local; struct sta_info *sta = NULL; struct ieee80211_key *key; int err; lockdep_assert_wiphy(local->hw.wiphy); if (!ieee80211_sdata_running(sdata)) return -ENETDOWN; if (IS_ERR(link)) return PTR_ERR(link); if (pairwise && params->mode == NL80211_KEY_SET_TX) return ieee80211_set_tx(sdata, mac_addr, key_idx); /* reject WEP and TKIP keys if WEP failed to initialize */ switch (params->cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_TKIP: case WLAN_CIPHER_SUITE_WEP104: if (link_id >= 0) return -EINVAL; if (WARN_ON_ONCE(fips_enabled)) return -EINVAL; break; default: break; } key = ieee80211_key_alloc(params->cipher, key_idx, 
params->key_len, params->key, params->seq_len, params->seq); if (IS_ERR(key)) return PTR_ERR(key); key->conf.link_id = link_id; if (pairwise) key->conf.flags |= IEEE80211_KEY_FLAG_PAIRWISE; if (params->mode == NL80211_KEY_NO_TX) key->conf.flags |= IEEE80211_KEY_FLAG_NO_AUTO_TX; if (mac_addr) { sta = sta_info_get_bss(sdata, mac_addr); /* * The ASSOC test makes sure the driver is ready to * receive the key. When wpa_supplicant has roamed * using FT, it attempts to set the key before * association has completed, this rejects that attempt * so it will set the key again after association. * * TODO: accept the key if we have a station entry and * add it to the device after the station. */ if (!sta || !test_sta_flag(sta, WLAN_STA_ASSOC)) { ieee80211_key_free_unused(key); return -ENOENT; } } switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: if (sdata->u.mgd.mfp != IEEE80211_MFP_DISABLED) key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT; break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: /* Keys without a station are used for TX only */ if (sta && test_sta_flag(sta, WLAN_STA_MFP)) key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT; break; case NL80211_IFTYPE_ADHOC: /* no MFP (yet) */ break; case NL80211_IFTYPE_MESH_POINT: #ifdef CONFIG_MAC80211_MESH if (sdata->u.mesh.security != IEEE80211_MESH_SEC_NONE) key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT; break; #endif case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_NAN: case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_OCB: /* shouldn't happen */ WARN_ON_ONCE(1); break; } err = ieee80211_key_link(key, link, sta); /* KRACK protection, shouldn't happen but just silently accept key */ if (err == -EALREADY) err = 0; return err; } static struct ieee80211_key * ieee80211_lookup_key(struct ieee80211_sub_if_data *sdata, int link_id, u8 key_idx, bool pairwise, const u8 *mac_addr) { struct ieee80211_local *local __maybe_unused = sdata->local; struct ieee80211_link_data *link = &sdata->deflink; struct ieee80211_key *key; if (link_id >= 0) { link = sdata_dereference(sdata->link[link_id], sdata); if (!link) return NULL; } if (mac_addr) { struct sta_info *sta; struct link_sta_info *link_sta; sta = sta_info_get_bss(sdata, mac_addr); if (!sta) return NULL; if (link_id >= 0) { link_sta = rcu_dereference_check(sta->link[link_id], lockdep_is_held(&local->hw.wiphy->mtx)); if (!link_sta) return NULL; } else { link_sta = &sta->deflink; } if (pairwise && key_idx < NUM_DEFAULT_KEYS) return wiphy_dereference(local->hw.wiphy, sta->ptk[key_idx]); if (!pairwise && key_idx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS + NUM_DEFAULT_BEACON_KEYS) return wiphy_dereference(local->hw.wiphy, link_sta->gtk[key_idx]); return NULL; } if (pairwise && key_idx < NUM_DEFAULT_KEYS) return wiphy_dereference(local->hw.wiphy, sdata->keys[key_idx]); key = wiphy_dereference(local->hw.wiphy, link->gtk[key_idx]); if (key) return key; /* or maybe it was a WEP key */ if (key_idx < NUM_DEFAULT_KEYS) return wiphy_dereference(local->hw.wiphy, sdata->keys[key_idx]); return NULL; } static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev, int link_id, u8 key_idx, bool pairwise, const u8 *mac_addr) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct ieee80211_key *key; lockdep_assert_wiphy(local->hw.wiphy); key = ieee80211_lookup_key(sdata, link_id, key_idx, pairwise, 
mac_addr); if (!key) return -ENOENT; ieee80211_key_free(key, sdata->vif.type == NL80211_IFTYPE_STATION); return 0; } static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, int link_id, u8 key_idx, bool pairwise, const u8 *mac_addr, void *cookie, void (*callback)(void *cookie, struct key_params *params)) { struct ieee80211_sub_if_data *sdata; u8 seq[6] = {0}; struct key_params params; struct ieee80211_key *key; u64 pn64; u32 iv32; u16 iv16; int err = -ENOENT; struct ieee80211_key_seq kseq = {}; sdata = IEEE80211_DEV_TO_SUB_IF(dev); rcu_read_lock(); key = ieee80211_lookup_key(sdata, link_id, key_idx, pairwise, mac_addr); if (!key) goto out; memset(&params, 0, sizeof(params)); params.cipher = key->conf.cipher; switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_TKIP: pn64 = atomic64_read(&key->conf.tx_pn); iv32 = TKIP_PN_TO_IV32(pn64); iv16 = TKIP_PN_TO_IV16(pn64); if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) { drv_get_key_seq(sdata->local, key, &kseq); iv32 = kseq.tkip.iv32; iv16 = kseq.tkip.iv16; } seq[0] = iv16 & 0xff; seq[1] = (iv16 >> 8) & 0xff; seq[2] = iv32 & 0xff; seq[3] = (iv32 >> 8) & 0xff; seq[4] = (iv32 >> 16) & 0xff; seq[5] = (iv32 >> 24) & 0xff; params.seq = seq; params.seq_len = 6; break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) != offsetof(typeof(kseq), aes_cmac)); fallthrough; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) != offsetof(typeof(kseq), aes_gmac)); fallthrough; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) != offsetof(typeof(kseq), gcmp)); if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) { drv_get_key_seq(sdata->local, key, &kseq); memcpy(seq, kseq.ccmp.pn, 6); } else { pn64 = atomic64_read(&key->conf.tx_pn); seq[0] = pn64; seq[1] = pn64 >> 8; seq[2] = pn64 >> 16; seq[3] = pn64 >> 24; seq[4] = pn64 >> 32; seq[5] = pn64 >> 40; } params.seq = seq; params.seq_len = 6; break; default: if (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) break; if (WARN_ON(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) break; drv_get_key_seq(sdata->local, key, &kseq); params.seq = kseq.hw.seq; params.seq_len = kseq.hw.seq_len; break; } callback(cookie, &params); err = 0; out: rcu_read_unlock(); return err; } static int ieee80211_config_default_key(struct wiphy *wiphy, struct net_device *dev, int link_id, u8 key_idx, bool uni, bool multi) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link = ieee80211_link_or_deflink(sdata, link_id, false); if (IS_ERR(link)) return PTR_ERR(link); ieee80211_set_default_key(link, key_idx, uni, multi); return 0; } static int ieee80211_config_default_mgmt_key(struct wiphy *wiphy, struct net_device *dev, int link_id, u8 key_idx) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link = ieee80211_link_or_deflink(sdata, link_id, true); if (IS_ERR(link)) return PTR_ERR(link); ieee80211_set_default_mgmt_key(link, key_idx); return 0; } static int ieee80211_config_default_beacon_key(struct wiphy *wiphy, struct net_device *dev, int link_id, u8 key_idx) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link = 
ieee80211_link_or_deflink(sdata, link_id, true); if (IS_ERR(link)) return PTR_ERR(link); ieee80211_set_default_beacon_key(link, key_idx); return 0; } void sta_set_rate_info_tx(struct sta_info *sta, const struct ieee80211_tx_rate *rate, struct rate_info *rinfo) { rinfo->flags = 0; if (rate->flags & IEEE80211_TX_RC_MCS) { rinfo->flags |= RATE_INFO_FLAGS_MCS; rinfo->mcs = rate->idx; } else if (rate->flags & IEEE80211_TX_RC_VHT_MCS) { rinfo->flags |= RATE_INFO_FLAGS_VHT_MCS; rinfo->mcs = ieee80211_rate_get_vht_mcs(rate); rinfo->nss = ieee80211_rate_get_vht_nss(rate); } else { struct ieee80211_supported_band *sband; sband = ieee80211_get_sband(sta->sdata); WARN_ON_ONCE(sband && !sband->bitrates); if (sband && sband->bitrates) rinfo->legacy = sband->bitrates[rate->idx].bitrate; } if (rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH) rinfo->bw = RATE_INFO_BW_40; else if (rate->flags & IEEE80211_TX_RC_80_MHZ_WIDTH) rinfo->bw = RATE_INFO_BW_80; else if (rate->flags & IEEE80211_TX_RC_160_MHZ_WIDTH) rinfo->bw = RATE_INFO_BW_160; else rinfo->bw = RATE_INFO_BW_20; if (rate->flags & IEEE80211_TX_RC_SHORT_GI) rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; } static int ieee80211_dump_station(struct wiphy *wiphy, struct net_device *dev, int idx, u8 *mac, struct station_info *sinfo) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct sta_info *sta; int ret = -ENOENT; lockdep_assert_wiphy(local->hw.wiphy); sta = sta_info_get_by_idx(sdata, idx); if (sta) { ret = 0; memcpy(mac, sta->sta.addr, ETH_ALEN); sta_set_sinfo(sta, sinfo, true); } return ret; } static int ieee80211_dump_survey(struct wiphy *wiphy, struct net_device *dev, int idx, struct survey_info *survey) { struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); return drv_get_survey(local, idx, survey); } static int ieee80211_get_station(struct wiphy *wiphy, struct net_device *dev, const u8 *mac, struct station_info *sinfo) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct sta_info *sta; int ret = -ENOENT; lockdep_assert_wiphy(local->hw.wiphy); sta = sta_info_get_bss(sdata, mac); if (sta) { ret = 0; sta_set_sinfo(sta, sinfo, true); } return ret; } static int ieee80211_set_monitor_channel(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_chan_def *chandef) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata; struct ieee80211_chan_req chanreq = { .oper = *chandef }; int ret; lockdep_assert_wiphy(local->hw.wiphy); sdata = IEEE80211_DEV_TO_SUB_IF(dev); if (!ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { if (cfg80211_chandef_identical(&local->monitor_chanreq.oper, &chanreq.oper)) return 0; sdata = wiphy_dereference(wiphy, local->monitor_sdata); if (!sdata) goto done; } if (rcu_access_pointer(sdata->deflink.conf->chanctx_conf) && cfg80211_chandef_identical(&sdata->vif.bss_conf.chanreq.oper, &chanreq.oper)) return 0; ieee80211_link_release_channel(&sdata->deflink); ret = ieee80211_link_use_channel(&sdata->deflink, &chanreq, IEEE80211_CHANCTX_SHARED); if (ret) return ret; done: local->monitor_chanreq = chanreq; return 0; } static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata, const u8 *resp, size_t resp_len, const struct ieee80211_csa_settings *csa, const struct ieee80211_color_change_settings *cca, struct ieee80211_link_data *link) { struct probe_resp *new, *old; if (!resp || !resp_len) return 1; old = sdata_dereference(link->u.ap.probe_resp, 
sdata); new = kzalloc(sizeof(struct probe_resp) + resp_len, GFP_KERNEL); if (!new) return -ENOMEM; new->len = resp_len; memcpy(new->data, resp, resp_len); if (csa) memcpy(new->cntdwn_counter_offsets, csa->counter_offsets_presp, csa->n_counter_offsets_presp * sizeof(new->cntdwn_counter_offsets[0])); else if (cca) new->cntdwn_counter_offsets[0] = cca->counter_offset_presp; rcu_assign_pointer(link->u.ap.probe_resp, new); if (old) kfree_rcu(old, rcu_head); return 0; } static int ieee80211_set_fils_discovery(struct ieee80211_sub_if_data *sdata, struct cfg80211_fils_discovery *params, struct ieee80211_link_data *link, struct ieee80211_bss_conf *link_conf, u64 *changed) { struct fils_discovery_data *new, *old = NULL; struct ieee80211_fils_discovery *fd; if (!params->update) return 0; fd = &link_conf->fils_discovery; fd->min_interval = params->min_interval; fd->max_interval = params->max_interval; old = sdata_dereference(link->u.ap.fils_discovery, sdata); if (old) kfree_rcu(old, rcu_head); if (params->tmpl && params->tmpl_len) { new = kzalloc(sizeof(*new) + params->tmpl_len, GFP_KERNEL); if (!new) return -ENOMEM; new->len = params->tmpl_len; memcpy(new->data, params->tmpl, params->tmpl_len); rcu_assign_pointer(link->u.ap.fils_discovery, new); } else { RCU_INIT_POINTER(link->u.ap.fils_discovery, NULL); } *changed |= BSS_CHANGED_FILS_DISCOVERY; return 0; } static int ieee80211_set_unsol_bcast_probe_resp(struct ieee80211_sub_if_data *sdata, struct cfg80211_unsol_bcast_probe_resp *params, struct ieee80211_link_data *link, struct ieee80211_bss_conf *link_conf, u64 *changed) { struct unsol_bcast_probe_resp_data *new, *old = NULL; if (!params->update) return 0; link_conf->unsol_bcast_probe_resp_interval = params->interval; old = sdata_dereference(link->u.ap.unsol_bcast_probe_resp, sdata); if (old) kfree_rcu(old, rcu_head); if (params->tmpl && params->tmpl_len) { new = kzalloc(sizeof(*new) + params->tmpl_len, GFP_KERNEL); if (!new) return -ENOMEM; new->len = params->tmpl_len; memcpy(new->data, params->tmpl, params->tmpl_len); rcu_assign_pointer(link->u.ap.unsol_bcast_probe_resp, new); } else { RCU_INIT_POINTER(link->u.ap.unsol_bcast_probe_resp, NULL); } *changed |= BSS_CHANGED_UNSOL_BCAST_PROBE_RESP; return 0; } static int ieee80211_set_ftm_responder_params( struct ieee80211_sub_if_data *sdata, const u8 *lci, size_t lci_len, const u8 *civicloc, size_t civicloc_len, struct ieee80211_bss_conf *link_conf) { struct ieee80211_ftm_responder_params *new, *old; u8 *pos; int len; if (!lci_len && !civicloc_len) return 0; old = link_conf->ftmr_params; len = lci_len + civicloc_len; new = kzalloc(sizeof(*new) + len, GFP_KERNEL); if (!new) return -ENOMEM; pos = (u8 *)(new + 1); if (lci_len) { new->lci_len = lci_len; new->lci = pos; memcpy(pos, lci, lci_len); pos += lci_len; } if (civicloc_len) { new->civicloc_len = civicloc_len; new->civicloc = pos; memcpy(pos, civicloc, civicloc_len); pos += civicloc_len; } link_conf->ftmr_params = new; kfree(old); return 0; } static int ieee80211_copy_mbssid_beacon(u8 *pos, struct cfg80211_mbssid_elems *dst, struct cfg80211_mbssid_elems *src) { int i, offset = 0; dst->cnt = src->cnt; for (i = 0; i < src->cnt; i++) { memcpy(pos + offset, src->elem[i].data, src->elem[i].len); dst->elem[i].len = src->elem[i].len; dst->elem[i].data = pos + offset; offset += dst->elem[i].len; } return offset; } static int ieee80211_copy_rnr_beacon(u8 *pos, struct cfg80211_rnr_elems *dst, struct cfg80211_rnr_elems *src) { int i, offset = 0; for (i = 0; i < src->cnt; i++) { memcpy(pos + offset, 
src->elem[i].data, src->elem[i].len); dst->elem[i].len = src->elem[i].len; dst->elem[i].data = pos + offset; offset += dst->elem[i].len; } dst->cnt = src->cnt; return offset; } static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, struct cfg80211_beacon_data *params, const struct ieee80211_csa_settings *csa, const struct ieee80211_color_change_settings *cca, u64 *changed) { struct cfg80211_mbssid_elems *mbssid = NULL; struct cfg80211_rnr_elems *rnr = NULL; struct beacon_data *new, *old; int new_head_len, new_tail_len; int size, err; u64 _changed = BSS_CHANGED_BEACON; struct ieee80211_bss_conf *link_conf = link->conf; old = sdata_dereference(link->u.ap.beacon, sdata); /* Need to have a beacon head if we don't have one yet */ if (!params->head && !old) return -EINVAL; /* new or old head? */ if (params->head) new_head_len = params->head_len; else new_head_len = old->head_len; /* new or old tail? */ if (params->tail || !old) /* params->tail_len will be zero for !params->tail */ new_tail_len = params->tail_len; else new_tail_len = old->tail_len; size = sizeof(*new) + new_head_len + new_tail_len; /* new or old multiple BSSID elements? */ if (params->mbssid_ies) { mbssid = params->mbssid_ies; size += struct_size(new->mbssid_ies, elem, mbssid->cnt); if (params->rnr_ies) { rnr = params->rnr_ies; size += struct_size(new->rnr_ies, elem, rnr->cnt); } size += ieee80211_get_mbssid_beacon_len(mbssid, rnr, mbssid->cnt); } else if (old && old->mbssid_ies) { mbssid = old->mbssid_ies; size += struct_size(new->mbssid_ies, elem, mbssid->cnt); if (old && old->rnr_ies) { rnr = old->rnr_ies; size += struct_size(new->rnr_ies, elem, rnr->cnt); } size += ieee80211_get_mbssid_beacon_len(mbssid, rnr, mbssid->cnt); } new = kzalloc(size, GFP_KERNEL); if (!new) return -ENOMEM; /* start filling the new info now */ /* * pointers go into the block we allocated, * memory is | beacon_data | head | tail | mbssid_ies | rnr_ies */ new->head = ((u8 *) new) + sizeof(*new); new->tail = new->head + new_head_len; new->head_len = new_head_len; new->tail_len = new_tail_len; /* copy in optional mbssid_ies */ if (mbssid) { u8 *pos = new->tail + new->tail_len; new->mbssid_ies = (void *)pos; pos += struct_size(new->mbssid_ies, elem, mbssid->cnt); pos += ieee80211_copy_mbssid_beacon(pos, new->mbssid_ies, mbssid); if (rnr) { new->rnr_ies = (void *)pos; pos += struct_size(new->rnr_ies, elem, rnr->cnt); ieee80211_copy_rnr_beacon(pos, new->rnr_ies, rnr); } /* update bssid_indicator */ link_conf->bssid_indicator = ilog2(__roundup_pow_of_two(mbssid->cnt + 1)); } if (csa) { new->cntdwn_current_counter = csa->count; memcpy(new->cntdwn_counter_offsets, csa->counter_offsets_beacon, csa->n_counter_offsets_beacon * sizeof(new->cntdwn_counter_offsets[0])); } else if (cca) { new->cntdwn_current_counter = cca->count; new->cntdwn_counter_offsets[0] = cca->counter_offset_beacon; } /* copy in head */ if (params->head) memcpy(new->head, params->head, new_head_len); else memcpy(new->head, old->head, new_head_len); /* copy in optional tail */ if (params->tail) memcpy(new->tail, params->tail, new_tail_len); else if (old) memcpy(new->tail, old->tail, new_tail_len); err = ieee80211_set_probe_resp(sdata, params->probe_resp, params->probe_resp_len, csa, cca, link); if (err < 0) { kfree(new); return err; } if (err == 0) _changed |= BSS_CHANGED_AP_PROBE_RESP; if (params->ftm_responder != -1) { link_conf->ftm_responder = params->ftm_responder; err = ieee80211_set_ftm_responder_params(sdata, params->lci, 
params->lci_len, params->civicloc, params->civicloc_len, link_conf); if (err < 0) { kfree(new); return err; } _changed |= BSS_CHANGED_FTM_RESPONDER; } rcu_assign_pointer(link->u.ap.beacon, new); sdata->u.ap.active = true; if (old) kfree_rcu(old, rcu_head); *changed |= _changed; return 0; } static u8 ieee80211_num_beaconing_links(struct ieee80211_sub_if_data *sdata) { struct ieee80211_link_data *link; u8 link_id, num = 0; if (sdata->vif.type != NL80211_IFTYPE_AP && sdata->vif.type != NL80211_IFTYPE_P2P_GO) return num; if (!sdata->vif.valid_links) return num; for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; if (sdata_dereference(link->u.ap.beacon, sdata)) num++; } return num; } static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_ap_settings *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct beacon_data *old; struct ieee80211_sub_if_data *vlan; u64 changed = BSS_CHANGED_BEACON_INT | BSS_CHANGED_BEACON_ENABLED | BSS_CHANGED_BEACON | BSS_CHANGED_P2P_PS | BSS_CHANGED_TXPOWER | BSS_CHANGED_TWT; int i, err; int prev_beacon_int; unsigned int link_id = params->beacon.link_id; struct ieee80211_link_data *link; struct ieee80211_bss_conf *link_conf; struct ieee80211_chan_req chanreq = { .oper = params->chandef }; lockdep_assert_wiphy(local->hw.wiphy); link = sdata_dereference(sdata->link[link_id], sdata); if (!link) return -ENOLINK; link_conf = link->conf; old = sdata_dereference(link->u.ap.beacon, sdata); if (old) return -EALREADY; link->smps_mode = IEEE80211_SMPS_OFF; link->needed_rx_chains = sdata->local->rx_chains; prev_beacon_int = link_conf->beacon_int; link_conf->beacon_int = params->beacon_interval; if (params->ht_cap) link_conf->ht_ldpc = params->ht_cap->cap_info & cpu_to_le16(IEEE80211_HT_CAP_LDPC_CODING); if (params->vht_cap) { link_conf->vht_ldpc = params->vht_cap->vht_cap_info & cpu_to_le32(IEEE80211_VHT_CAP_RXLDPC); link_conf->vht_su_beamformer = params->vht_cap->vht_cap_info & cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE); link_conf->vht_su_beamformee = params->vht_cap->vht_cap_info & cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE); link_conf->vht_mu_beamformer = params->vht_cap->vht_cap_info & cpu_to_le32(IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE); link_conf->vht_mu_beamformee = params->vht_cap->vht_cap_info & cpu_to_le32(IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE); } if (params->he_cap && params->he_oper) { link_conf->he_support = true; link_conf->htc_trig_based_pkt_ext = le32_get_bits(params->he_oper->he_oper_params, IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK); link_conf->frame_time_rts_th = le32_get_bits(params->he_oper->he_oper_params, IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK); changed |= BSS_CHANGED_HE_OBSS_PD; if (params->beacon.he_bss_color.enabled) changed |= BSS_CHANGED_HE_BSS_COLOR; } if (params->he_cap) { link_conf->he_ldpc = params->he_cap->phy_cap_info[1] & IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD; link_conf->he_su_beamformer = params->he_cap->phy_cap_info[3] & IEEE80211_HE_PHY_CAP3_SU_BEAMFORMER; link_conf->he_su_beamformee = params->he_cap->phy_cap_info[4] & IEEE80211_HE_PHY_CAP4_SU_BEAMFORMEE; link_conf->he_mu_beamformer = params->he_cap->phy_cap_info[4] & IEEE80211_HE_PHY_CAP4_MU_BEAMFORMER; link_conf->he_full_ul_mumimo = params->he_cap->phy_cap_info[2] & IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO; } if (params->eht_cap) { if 
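	/* EHT operation requires HE support to have been enabled above */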
(!link_conf->he_support) return -EOPNOTSUPP; link_conf->eht_support = true; link_conf->eht_su_beamformer = params->eht_cap->fixed.phy_cap_info[0] & IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER; link_conf->eht_su_beamformee = params->eht_cap->fixed.phy_cap_info[0] & IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE; link_conf->eht_mu_beamformer = params->eht_cap->fixed.phy_cap_info[7] & (IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ | IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_320MHZ); link_conf->eht_80mhz_full_bw_ul_mumimo = params->eht_cap->fixed.phy_cap_info[7] & (IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_320MHZ); } else { link_conf->eht_su_beamformer = false; link_conf->eht_su_beamformee = false; link_conf->eht_mu_beamformer = false; } if (sdata->vif.type == NL80211_IFTYPE_AP && params->mbssid_config.tx_wdev) { err = ieee80211_set_ap_mbssid_options(sdata, &params->mbssid_config, link_conf); if (err) return err; } err = ieee80211_link_use_channel(link, &chanreq, IEEE80211_CHANCTX_SHARED); if (!err) ieee80211_link_copy_chanctx_to_vlans(link, false); if (err) { link_conf->beacon_int = prev_beacon_int; return err; } /* * Apply control port protocol, this allows us to * not encrypt dynamic WEP control frames. */ sdata->control_port_protocol = params->crypto.control_port_ethertype; sdata->control_port_no_encrypt = params->crypto.control_port_no_encrypt; sdata->control_port_over_nl80211 = params->crypto.control_port_over_nl80211; sdata->control_port_no_preauth = params->crypto.control_port_no_preauth; list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) { vlan->control_port_protocol = params->crypto.control_port_ethertype; vlan->control_port_no_encrypt = params->crypto.control_port_no_encrypt; vlan->control_port_over_nl80211 = params->crypto.control_port_over_nl80211; vlan->control_port_no_preauth = params->crypto.control_port_no_preauth; } link_conf->dtim_period = params->dtim_period; link_conf->enable_beacon = true; link_conf->allow_p2p_go_ps = sdata->vif.p2p; link_conf->twt_responder = params->twt_responder; link_conf->he_obss_pd = params->he_obss_pd; link_conf->he_bss_color = params->beacon.he_bss_color; sdata->vif.cfg.s1g = params->chandef.chan->band == NL80211_BAND_S1GHZ; sdata->vif.cfg.ssid_len = params->ssid_len; if (params->ssid_len) memcpy(sdata->vif.cfg.ssid, params->ssid, params->ssid_len); link_conf->hidden_ssid = (params->hidden_ssid != NL80211_HIDDEN_SSID_NOT_IN_USE); memset(&link_conf->p2p_noa_attr, 0, sizeof(link_conf->p2p_noa_attr)); link_conf->p2p_noa_attr.oppps_ctwindow = params->p2p_ctwindow & IEEE80211_P2P_OPPPS_CTWINDOW_MASK; if (params->p2p_opp_ps) link_conf->p2p_noa_attr.oppps_ctwindow |= IEEE80211_P2P_OPPPS_ENABLE_BIT; sdata->beacon_rate_set = false; if (wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_BEACON_RATE_LEGACY)) { for (i = 0; i < NUM_NL80211_BANDS; i++) { sdata->beacon_rateidx_mask[i] = params->beacon_rate.control[i].legacy; if (sdata->beacon_rateidx_mask[i]) sdata->beacon_rate_set = true; } } if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) link_conf->beacon_tx_rate = params->beacon_rate; err = ieee80211_assign_beacon(sdata, link, &params->beacon, NULL, NULL, &changed); if (err < 0) goto error; err = ieee80211_set_fils_discovery(sdata, &params->fils_discovery, link, link_conf, &changed); if (err < 0) goto error; err = ieee80211_set_unsol_bcast_probe_resp(sdata, &params->unsol_bcast_probe_resp, link, link_conf, 
&changed); if (err < 0) goto error; err = drv_start_ap(sdata->local, sdata, link_conf); if (err) { old = sdata_dereference(link->u.ap.beacon, sdata); if (old) kfree_rcu(old, rcu_head); RCU_INIT_POINTER(link->u.ap.beacon, NULL); if (ieee80211_num_beaconing_links(sdata) == 0) sdata->u.ap.active = false; goto error; } ieee80211_recalc_dtim(local, sdata); ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_SSID); ieee80211_link_info_change_notify(sdata, link, changed); if (ieee80211_num_beaconing_links(sdata) <= 1) netif_carrier_on(dev); list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) netif_carrier_on(vlan->dev); return 0; error: ieee80211_link_release_channel(link); return err; } static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_ap_update *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link; struct cfg80211_beacon_data *beacon = &params->beacon; struct beacon_data *old; int err; struct ieee80211_bss_conf *link_conf; u64 changed = 0; lockdep_assert_wiphy(wiphy); link = sdata_dereference(sdata->link[beacon->link_id], sdata); if (!link) return -ENOLINK; link_conf = link->conf; /* don't allow changing the beacon while a countdown is in place - offset * of channel switch counter may change */ if (link_conf->csa_active || link_conf->color_change_active) return -EBUSY; old = sdata_dereference(link->u.ap.beacon, sdata); if (!old) return -ENOENT; err = ieee80211_assign_beacon(sdata, link, beacon, NULL, NULL, &changed); if (err < 0) return err; err = ieee80211_set_fils_discovery(sdata, &params->fils_discovery, link, link_conf, &changed); if (err < 0) return err; err = ieee80211_set_unsol_bcast_probe_resp(sdata, &params->unsol_bcast_probe_resp, link, link_conf, &changed); if (err < 0) return err; if (beacon->he_bss_color_valid && beacon->he_bss_color.enabled != link_conf->he_bss_color.enabled) { link_conf->he_bss_color.enabled = beacon->he_bss_color.enabled; changed |= BSS_CHANGED_HE_BSS_COLOR; } ieee80211_link_info_change_notify(sdata, link, changed); return 0; } static void ieee80211_free_next_beacon(struct ieee80211_link_data *link) { if (!link->u.ap.next_beacon) return; kfree(link->u.ap.next_beacon->mbssid_ies); kfree(link->u.ap.next_beacon->rnr_ies); kfree(link->u.ap.next_beacon); link->u.ap.next_beacon = NULL; } static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_sub_if_data *vlan; struct ieee80211_local *local = sdata->local; struct beacon_data *old_beacon; struct probe_resp *old_probe_resp; struct fils_discovery_data *old_fils_discovery; struct unsol_bcast_probe_resp_data *old_unsol_bcast_probe_resp; struct cfg80211_chan_def chandef; struct ieee80211_link_data *link = sdata_dereference(sdata->link[link_id], sdata); struct ieee80211_bss_conf *link_conf = link->conf; LIST_HEAD(keys); lockdep_assert_wiphy(local->hw.wiphy); old_beacon = sdata_dereference(link->u.ap.beacon, sdata); if (!old_beacon) return -ENOENT; old_probe_resp = sdata_dereference(link->u.ap.probe_resp, sdata); old_fils_discovery = sdata_dereference(link->u.ap.fils_discovery, sdata); old_unsol_bcast_probe_resp = sdata_dereference(link->u.ap.unsol_bcast_probe_resp, sdata); /* abort any running channel switch or color change */ link_conf->csa_active = false; link_conf->color_change_active = false; ieee80211_vif_unblock_queues_csa(sdata); ieee80211_free_next_beacon(link); /* turn off carrier 
for this interface and dependent VLANs */ list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) netif_carrier_off(vlan->dev); if (ieee80211_num_beaconing_links(sdata) <= 1) { netif_carrier_off(dev); sdata->u.ap.active = false; } /* remove beacon and probe response */ RCU_INIT_POINTER(link->u.ap.beacon, NULL); RCU_INIT_POINTER(link->u.ap.probe_resp, NULL); RCU_INIT_POINTER(link->u.ap.fils_discovery, NULL); RCU_INIT_POINTER(link->u.ap.unsol_bcast_probe_resp, NULL); kfree_rcu(old_beacon, rcu_head); if (old_probe_resp) kfree_rcu(old_probe_resp, rcu_head); if (old_fils_discovery) kfree_rcu(old_fils_discovery, rcu_head); if (old_unsol_bcast_probe_resp) kfree_rcu(old_unsol_bcast_probe_resp, rcu_head); kfree(link_conf->ftmr_params); link_conf->ftmr_params = NULL; sdata->vif.mbssid_tx_vif = NULL; link_conf->bssid_index = 0; link_conf->nontransmitted = false; link_conf->ema_ap = false; link_conf->bssid_indicator = 0; __sta_info_flush(sdata, true, link_id); ieee80211_remove_link_keys(link, &keys); if (!list_empty(&keys)) { synchronize_net(); ieee80211_free_key_list(local, &keys); } link_conf->enable_beacon = false; sdata->beacon_rate_set = false; sdata->vif.cfg.ssid_len = 0; clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state); ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_BEACON_ENABLED); if (sdata->wdev.links[link_id].cac_started) { chandef = link_conf->chanreq.oper; wiphy_delayed_work_cancel(wiphy, &link->dfs_cac_timer_work); cfg80211_cac_event(sdata->dev, &chandef, NL80211_RADAR_CAC_ABORTED, GFP_KERNEL, link_id); } drv_stop_ap(sdata->local, sdata, link_conf); /* free all potentially still buffered bcast frames */ local->total_ps_buffered -= skb_queue_len(&sdata->u.ap.ps.bc_buf); ieee80211_purge_tx_queue(&local->hw, &sdata->u.ap.ps.bc_buf); ieee80211_link_copy_chanctx_to_vlans(link, true); ieee80211_link_release_channel(link); return 0; } static int sta_apply_auth_flags(struct ieee80211_local *local, struct sta_info *sta, u32 mask, u32 set) { int ret; if (mask & BIT(NL80211_STA_FLAG_AUTHENTICATED) && set & BIT(NL80211_STA_FLAG_AUTHENTICATED) && !test_sta_flag(sta, WLAN_STA_AUTH)) { ret = sta_info_move_state(sta, IEEE80211_STA_AUTH); if (ret) return ret; } if (mask & BIT(NL80211_STA_FLAG_ASSOCIATED) && set & BIT(NL80211_STA_FLAG_ASSOCIATED) && !test_sta_flag(sta, WLAN_STA_ASSOC)) { /* * When peer becomes associated, init rate control as * well. Some drivers require rate control initialized * before drv_sta_state() is called. 
		 */
		if (!test_sta_flag(sta, WLAN_STA_RATE_CONTROL))
			rate_control_rate_init_all_links(sta);

		ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC);
		if (ret)
			return ret;
	}

	if (mask & BIT(NL80211_STA_FLAG_AUTHORIZED)) {
		if (set & BIT(NL80211_STA_FLAG_AUTHORIZED))
			ret = sta_info_move_state(sta, IEEE80211_STA_AUTHORIZED);
		else if (test_sta_flag(sta, WLAN_STA_AUTHORIZED))
			ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC);
		else
			ret = 0;
		if (ret)
			return ret;
	}

	if (mask & BIT(NL80211_STA_FLAG_ASSOCIATED) &&
	    !(set & BIT(NL80211_STA_FLAG_ASSOCIATED)) &&
	    test_sta_flag(sta, WLAN_STA_ASSOC)) {
		ret = sta_info_move_state(sta, IEEE80211_STA_AUTH);
		if (ret)
			return ret;
	}

	if (mask & BIT(NL80211_STA_FLAG_AUTHENTICATED) &&
	    !(set & BIT(NL80211_STA_FLAG_AUTHENTICATED)) &&
	    test_sta_flag(sta, WLAN_STA_AUTH)) {
		ret = sta_info_move_state(sta, IEEE80211_STA_NONE);
		if (ret)
			return ret;
	}

	return 0;
}

static void sta_apply_mesh_params(struct ieee80211_local *local,
				  struct sta_info *sta,
				  struct station_parameters *params)
{
#ifdef CONFIG_MAC80211_MESH
	struct ieee80211_sub_if_data *sdata = sta->sdata;
	u64 changed = 0;

	if (params->sta_modify_mask & STATION_PARAM_APPLY_PLINK_STATE) {
		switch (params->plink_state) {
		case NL80211_PLINK_ESTAB:
			if (sta->mesh->plink_state != NL80211_PLINK_ESTAB)
				changed = mesh_plink_inc_estab_count(sdata);

			sta->mesh->plink_state = params->plink_state;
			sta->mesh->aid = params->peer_aid;

			ieee80211_mps_sta_status_update(sta);
			changed |= ieee80211_mps_set_sta_local_pm(sta,
					sdata->u.mesh.mshcfg.power_mode);

			ewma_mesh_tx_rate_avg_init(&sta->mesh->tx_rate_avg);
			/* init at low value */
			ewma_mesh_tx_rate_avg_add(&sta->mesh->tx_rate_avg, 10);

			break;
		case NL80211_PLINK_LISTEN:
		case NL80211_PLINK_BLOCKED:
		case NL80211_PLINK_OPN_SNT:
		case NL80211_PLINK_OPN_RCVD:
		case NL80211_PLINK_CNF_RCVD:
		case NL80211_PLINK_HOLDING:
			if (sta->mesh->plink_state == NL80211_PLINK_ESTAB)
				changed = mesh_plink_dec_estab_count(sdata);

			sta->mesh->plink_state = params->plink_state;

			ieee80211_mps_sta_status_update(sta);
			changed |= ieee80211_mps_set_sta_local_pm(sta,
					NL80211_MESH_POWER_UNKNOWN);

			break;
		default:
			/* nothing */
			break;
		}
	}

	switch (params->plink_action) {
	case NL80211_PLINK_ACTION_NO_ACTION:
		/* nothing */
		break;
	case NL80211_PLINK_ACTION_OPEN:
		changed |= mesh_plink_open(sta);
		break;
	case NL80211_PLINK_ACTION_BLOCK:
		changed |= mesh_plink_block(sta);
		break;
	}

	if (params->local_pm)
		changed |= ieee80211_mps_set_sta_local_pm(sta,
							  params->local_pm);

	ieee80211_mbss_info_change_notify(sdata, changed);
#endif
}

enum sta_link_apply_mode {
	STA_LINK_MODE_NEW,
	STA_LINK_MODE_STA_MODIFY,
	STA_LINK_MODE_LINK_MODIFY,
};

static int sta_link_apply_parameters(struct ieee80211_local *local,
				     struct sta_info *sta,
				     enum sta_link_apply_mode mode,
				     struct link_station_parameters *params)
{
	struct ieee80211_supported_band *sband;
	struct ieee80211_sub_if_data *sdata = sta->sdata;
	u32 link_id = params->link_id < 0 ?
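		      /* a negative link_id selects link 0 (the default link) */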
0 : params->link_id; struct ieee80211_link_data *link = sdata_dereference(sdata->link[link_id], sdata); struct link_sta_info *link_sta = rcu_dereference_protected(sta->link[link_id], lockdep_is_held(&local->hw.wiphy->mtx)); bool changes = params->link_mac || params->txpwr_set || params->supported_rates_len || params->ht_capa || params->vht_capa || params->he_capa || params->eht_capa || params->opmode_notif_used; switch (mode) { case STA_LINK_MODE_NEW: if (!params->link_mac) return -EINVAL; break; case STA_LINK_MODE_LINK_MODIFY: break; case STA_LINK_MODE_STA_MODIFY: if (params->link_id >= 0) break; if (!changes) return 0; break; } if (!link || !link_sta) return -EINVAL; sband = ieee80211_get_link_sband(link); if (!sband) return -EINVAL; if (params->link_mac) { if (mode == STA_LINK_MODE_NEW) { memcpy(link_sta->addr, params->link_mac, ETH_ALEN); memcpy(link_sta->pub->addr, params->link_mac, ETH_ALEN); } else if (!ether_addr_equal(link_sta->addr, params->link_mac)) { return -EINVAL; } } if (params->txpwr_set) { int ret; link_sta->pub->txpwr.type = params->txpwr.type; if (params->txpwr.type == NL80211_TX_POWER_LIMITED) link_sta->pub->txpwr.power = params->txpwr.power; ret = drv_sta_set_txpwr(local, sdata, sta); if (ret) return ret; } if (params->supported_rates && params->supported_rates_len) { ieee80211_parse_bitrates(link->conf->chanreq.oper.width, sband, params->supported_rates, params->supported_rates_len, &link_sta->pub->supp_rates[sband->band]); } if (params->ht_capa) ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, params->ht_capa, link_sta); /* VHT can override some HT caps such as the A-MSDU max length */ if (params->vht_capa) ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, params->vht_capa, NULL, link_sta); if (params->he_capa) ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, (void *)params->he_capa, params->he_capa_len, (void *)params->he_6ghz_capa, link_sta); if (params->he_capa && params->eht_capa) ieee80211_eht_cap_ie_to_sta_eht_cap(sdata, sband, (u8 *)params->he_capa, params->he_capa_len, params->eht_capa, params->eht_capa_len, link_sta); ieee80211_sta_init_nss(link_sta); if (params->opmode_notif_used) { /* returned value is only needed for rc update, but the * rc isn't initialized here yet, so ignore it */ __ieee80211_vht_handle_opmode(sdata, link_sta, params->opmode_notif, sband->band); } return 0; } static int sta_apply_parameters(struct ieee80211_local *local, struct sta_info *sta, struct station_parameters *params) { struct ieee80211_sub_if_data *sdata = sta->sdata; u32 mask, set; int ret = 0; mask = params->sta_flags_mask; set = params->sta_flags_set; if (ieee80211_vif_is_mesh(&sdata->vif)) { /* * In mesh mode, ASSOCIATED isn't part of the nl80211 * API but must follow AUTHENTICATED for driver state. 
*/ if (mask & BIT(NL80211_STA_FLAG_AUTHENTICATED)) mask |= BIT(NL80211_STA_FLAG_ASSOCIATED); if (set & BIT(NL80211_STA_FLAG_AUTHENTICATED)) set |= BIT(NL80211_STA_FLAG_ASSOCIATED); } else if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { /* * TDLS -- everything follows authorized, but * only becoming authorized is possible, not * going back */ if (set & BIT(NL80211_STA_FLAG_AUTHORIZED)) { set |= BIT(NL80211_STA_FLAG_AUTHENTICATED) | BIT(NL80211_STA_FLAG_ASSOCIATED); mask |= BIT(NL80211_STA_FLAG_AUTHENTICATED) | BIT(NL80211_STA_FLAG_ASSOCIATED); } } if (mask & BIT(NL80211_STA_FLAG_WME) && local->hw.queues >= IEEE80211_NUM_ACS) sta->sta.wme = set & BIT(NL80211_STA_FLAG_WME); /* auth flags will be set later for TDLS, * and for unassociated stations that move to associated */ if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER) && !((mask & BIT(NL80211_STA_FLAG_ASSOCIATED)) && (set & BIT(NL80211_STA_FLAG_ASSOCIATED)))) { ret = sta_apply_auth_flags(local, sta, mask, set); if (ret) return ret; } if (mask & BIT(NL80211_STA_FLAG_SHORT_PREAMBLE)) { if (set & BIT(NL80211_STA_FLAG_SHORT_PREAMBLE)) set_sta_flag(sta, WLAN_STA_SHORT_PREAMBLE); else clear_sta_flag(sta, WLAN_STA_SHORT_PREAMBLE); } if (mask & BIT(NL80211_STA_FLAG_MFP)) { sta->sta.mfp = !!(set & BIT(NL80211_STA_FLAG_MFP)); if (set & BIT(NL80211_STA_FLAG_MFP)) set_sta_flag(sta, WLAN_STA_MFP); else clear_sta_flag(sta, WLAN_STA_MFP); } if (mask & BIT(NL80211_STA_FLAG_TDLS_PEER)) { if (set & BIT(NL80211_STA_FLAG_TDLS_PEER)) set_sta_flag(sta, WLAN_STA_TDLS_PEER); else clear_sta_flag(sta, WLAN_STA_TDLS_PEER); } if (mask & BIT(NL80211_STA_FLAG_SPP_AMSDU)) sta->sta.spp_amsdu = set & BIT(NL80211_STA_FLAG_SPP_AMSDU); /* mark TDLS channel switch support, if the AP allows it */ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) && !sdata->deflink.u.mgd.tdls_chan_switch_prohibited && params->ext_capab_len >= 4 && params->ext_capab[3] & WLAN_EXT_CAPA4_TDLS_CHAN_SWITCH) set_sta_flag(sta, WLAN_STA_TDLS_CHAN_SWITCH); if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) && !sdata->u.mgd.tdls_wider_bw_prohibited && ieee80211_hw_check(&local->hw, TDLS_WIDER_BW) && params->ext_capab_len >= 8 && params->ext_capab[7] & WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED) set_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW); if (params->sta_modify_mask & STATION_PARAM_APPLY_UAPSD) { sta->sta.uapsd_queues = params->uapsd_queues; sta->sta.max_sp = params->max_sp; } ieee80211_sta_set_max_amsdu_subframes(sta, params->ext_capab, params->ext_capab_len); /* * cfg80211 validates this (1-2007) and allows setting the AID * only when creating a new station entry */ if (params->aid) sta->sta.aid = params->aid; /* * Some of the following updates would be racy if called on an * existing station, via ieee80211_change_station(). However, * all such changes are rejected by cfg80211 except for updates * changing the supported rates on an existing but not yet used * TDLS peer. 
*/ if (params->listen_interval >= 0) sta->listen_interval = params->listen_interval; ret = sta_link_apply_parameters(local, sta, STA_LINK_MODE_STA_MODIFY, &params->link_sta_params); if (ret) return ret; if (params->support_p2p_ps >= 0) sta->sta.support_p2p_ps = params->support_p2p_ps; if (ieee80211_vif_is_mesh(&sdata->vif)) sta_apply_mesh_params(local, sta, params); if (params->airtime_weight) sta->airtime_weight = params->airtime_weight; /* set the STA state after all sta info from usermode has been set */ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) || set & BIT(NL80211_STA_FLAG_ASSOCIATED)) { ret = sta_apply_auth_flags(local, sta, mask, set); if (ret) return ret; } /* Mark the STA as MLO if MLD MAC address is available */ if (params->link_sta_params.mld_mac) sta->sta.mlo = true; return 0; } static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, const u8 *mac, struct station_parameters *params) { struct ieee80211_local *local = wiphy_priv(wiphy); struct sta_info *sta; struct ieee80211_sub_if_data *sdata; int err; lockdep_assert_wiphy(local->hw.wiphy); if (params->vlan) { sdata = IEEE80211_DEV_TO_SUB_IF(params->vlan); if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN && sdata->vif.type != NL80211_IFTYPE_AP) return -EINVAL; } else sdata = IEEE80211_DEV_TO_SUB_IF(dev); if (ether_addr_equal(mac, sdata->vif.addr)) return -EINVAL; if (!is_valid_ether_addr(mac)) return -EINVAL; if (params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER) && sdata->vif.type == NL80211_IFTYPE_STATION && !sdata->u.mgd.associated) return -EINVAL; /* * If we have a link ID, it can be a non-MLO station on an AP MLD, * but we need to have a link_mac in that case as well, so use the * STA's MAC address in that case. */ if (params->link_sta_params.link_id >= 0) sta = sta_info_alloc_with_link(sdata, mac, params->link_sta_params.link_id, params->link_sta_params.link_mac ?: mac, GFP_KERNEL); else sta = sta_info_alloc(sdata, mac, GFP_KERNEL); if (!sta) return -ENOMEM; if (params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)) sta->sta.tdls = true; /* Though the mutex is not needed here (since the station is not * visible yet), sta_apply_parameters (and inner functions) require * the mutex due to other paths. 
*/ err = sta_apply_parameters(local, sta, params); if (err) { sta_info_free(local, sta); return err; } /* * for TDLS and for unassociated station, rate control should be * initialized only when rates are known and station is marked * authorized/associated */ if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER) && test_sta_flag(sta, WLAN_STA_ASSOC)) rate_control_rate_init_all_links(sta); return sta_info_insert(sta); } static int ieee80211_del_station(struct wiphy *wiphy, struct net_device *dev, struct station_del_parameters *params) { struct ieee80211_sub_if_data *sdata; sdata = IEEE80211_DEV_TO_SUB_IF(dev); if (params->mac) return sta_info_destroy_addr_bss(sdata, params->mac); sta_info_flush(sdata, params->link_id); return 0; } static int ieee80211_change_station(struct wiphy *wiphy, struct net_device *dev, const u8 *mac, struct station_parameters *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = wiphy_priv(wiphy); struct sta_info *sta; struct ieee80211_sub_if_data *vlansdata; enum cfg80211_station_type statype; int err; lockdep_assert_wiphy(local->hw.wiphy); sta = sta_info_get_bss(sdata, mac); if (!sta) return -ENOENT; switch (sdata->vif.type) { case NL80211_IFTYPE_MESH_POINT: if (sdata->u.mesh.user_mpm) statype = CFG80211_STA_MESH_PEER_USER; else statype = CFG80211_STA_MESH_PEER_KERNEL; break; case NL80211_IFTYPE_ADHOC: statype = CFG80211_STA_IBSS; break; case NL80211_IFTYPE_STATION: if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { statype = CFG80211_STA_AP_STA; break; } if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) statype = CFG80211_STA_TDLS_PEER_ACTIVE; else statype = CFG80211_STA_TDLS_PEER_SETUP; break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: if (test_sta_flag(sta, WLAN_STA_ASSOC)) statype = CFG80211_STA_AP_CLIENT; else statype = CFG80211_STA_AP_CLIENT_UNASSOC; break; default: return -EOPNOTSUPP; } err = cfg80211_check_station_change(wiphy, params, statype); if (err) return err; if (params->vlan && params->vlan != sta->sdata->dev) { vlansdata = IEEE80211_DEV_TO_SUB_IF(params->vlan); if (params->vlan->ieee80211_ptr->use_4addr) { if (vlansdata->u.vlan.sta) return -EBUSY; rcu_assign_pointer(vlansdata->u.vlan.sta, sta); __ieee80211_check_fast_rx_iface(vlansdata); drv_sta_set_4addr(local, sta->sdata, &sta->sta, true); } if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN && sta->sdata->u.vlan.sta) RCU_INIT_POINTER(sta->sdata->u.vlan.sta, NULL); if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) ieee80211_vif_dec_num_mcast(sta->sdata); sta->sdata = vlansdata; ieee80211_check_fast_rx(sta); ieee80211_check_fast_xmit(sta); if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) { ieee80211_vif_inc_num_mcast(sta->sdata); cfg80211_send_layer2_update(sta->sdata->dev, sta->sta.addr); } } err = sta_apply_parameters(local, sta, params); if (err) return err; if (sdata->vif.type == NL80211_IFTYPE_STATION && params->sta_flags_mask & BIT(NL80211_STA_FLAG_AUTHORIZED)) { ieee80211_recalc_ps(local); ieee80211_recalc_ps_vif(sdata); } return 0; } #ifdef CONFIG_MAC80211_MESH static int ieee80211_add_mpath(struct wiphy *wiphy, struct net_device *dev, const u8 *dst, const u8 *next_hop) { struct ieee80211_sub_if_data *sdata; struct mesh_path *mpath; struct sta_info *sta; sdata = IEEE80211_DEV_TO_SUB_IF(dev); rcu_read_lock(); sta = sta_info_get(sdata, next_hop); if (!sta) { rcu_read_unlock(); return -ENOENT; } mpath = mesh_path_add(sdata, dst); if (IS_ERR(mpath)) { rcu_read_unlock(); return PTR_ERR(mpath); } mesh_path_fix_nexthop(mpath, sta); rcu_read_unlock(); return 
0; } static int ieee80211_del_mpath(struct wiphy *wiphy, struct net_device *dev, const u8 *dst) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); if (dst) return mesh_path_del(sdata, dst); mesh_path_flush_by_iface(sdata); return 0; } static int ieee80211_change_mpath(struct wiphy *wiphy, struct net_device *dev, const u8 *dst, const u8 *next_hop) { struct ieee80211_sub_if_data *sdata; struct mesh_path *mpath; struct sta_info *sta; sdata = IEEE80211_DEV_TO_SUB_IF(dev); rcu_read_lock(); sta = sta_info_get(sdata, next_hop); if (!sta) { rcu_read_unlock(); return -ENOENT; } mpath = mesh_path_lookup(sdata, dst); if (!mpath) { rcu_read_unlock(); return -ENOENT; } mesh_path_fix_nexthop(mpath, sta); rcu_read_unlock(); return 0; } static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop, struct mpath_info *pinfo) { struct sta_info *next_hop_sta = rcu_dereference(mpath->next_hop); if (next_hop_sta) memcpy(next_hop, next_hop_sta->sta.addr, ETH_ALEN); else eth_zero_addr(next_hop); memset(pinfo, 0, sizeof(*pinfo)); pinfo->generation = mpath->sdata->u.mesh.mesh_paths_generation; pinfo->filled = MPATH_INFO_FRAME_QLEN | MPATH_INFO_SN | MPATH_INFO_METRIC | MPATH_INFO_EXPTIME | MPATH_INFO_DISCOVERY_TIMEOUT | MPATH_INFO_DISCOVERY_RETRIES | MPATH_INFO_FLAGS | MPATH_INFO_HOP_COUNT | MPATH_INFO_PATH_CHANGE; pinfo->frame_qlen = mpath->frame_queue.qlen; pinfo->sn = mpath->sn; pinfo->metric = mpath->metric; if (time_before(jiffies, mpath->exp_time)) pinfo->exptime = jiffies_to_msecs(mpath->exp_time - jiffies); pinfo->discovery_timeout = jiffies_to_msecs(mpath->discovery_timeout); pinfo->discovery_retries = mpath->discovery_retries; if (mpath->flags & MESH_PATH_ACTIVE) pinfo->flags |= NL80211_MPATH_FLAG_ACTIVE; if (mpath->flags & MESH_PATH_RESOLVING) pinfo->flags |= NL80211_MPATH_FLAG_RESOLVING; if (mpath->flags & MESH_PATH_SN_VALID) pinfo->flags |= NL80211_MPATH_FLAG_SN_VALID; if (mpath->flags & MESH_PATH_FIXED) pinfo->flags |= NL80211_MPATH_FLAG_FIXED; if (mpath->flags & MESH_PATH_RESOLVED) pinfo->flags |= NL80211_MPATH_FLAG_RESOLVED; pinfo->hop_count = mpath->hop_count; pinfo->path_change_count = mpath->path_change_count; } static int ieee80211_get_mpath(struct wiphy *wiphy, struct net_device *dev, u8 *dst, u8 *next_hop, struct mpath_info *pinfo) { struct ieee80211_sub_if_data *sdata; struct mesh_path *mpath; sdata = IEEE80211_DEV_TO_SUB_IF(dev); rcu_read_lock(); mpath = mesh_path_lookup(sdata, dst); if (!mpath) { rcu_read_unlock(); return -ENOENT; } memcpy(dst, mpath->dst, ETH_ALEN); mpath_set_pinfo(mpath, next_hop, pinfo); rcu_read_unlock(); return 0; } static int ieee80211_dump_mpath(struct wiphy *wiphy, struct net_device *dev, int idx, u8 *dst, u8 *next_hop, struct mpath_info *pinfo) { struct ieee80211_sub_if_data *sdata; struct mesh_path *mpath; sdata = IEEE80211_DEV_TO_SUB_IF(dev); rcu_read_lock(); mpath = mesh_path_lookup_by_idx(sdata, idx); if (!mpath) { rcu_read_unlock(); return -ENOENT; } memcpy(dst, mpath->dst, ETH_ALEN); mpath_set_pinfo(mpath, next_hop, pinfo); rcu_read_unlock(); return 0; } static void mpp_set_pinfo(struct mesh_path *mpath, u8 *mpp, struct mpath_info *pinfo) { memset(pinfo, 0, sizeof(*pinfo)); memcpy(mpp, mpath->mpp, ETH_ALEN); pinfo->generation = mpath->sdata->u.mesh.mpp_paths_generation; } static int ieee80211_get_mpp(struct wiphy *wiphy, struct net_device *dev, u8 *dst, u8 *mpp, struct mpath_info *pinfo) { struct ieee80211_sub_if_data *sdata; struct mesh_path *mpath; sdata = IEEE80211_DEV_TO_SUB_IF(dev); rcu_read_lock(); mpath = mpp_path_lookup(sdata, 
dst); if (!mpath) { rcu_read_unlock(); return -ENOENT; } memcpy(dst, mpath->dst, ETH_ALEN); mpp_set_pinfo(mpath, mpp, pinfo); rcu_read_unlock(); return 0; } static int ieee80211_dump_mpp(struct wiphy *wiphy, struct net_device *dev, int idx, u8 *dst, u8 *mpp, struct mpath_info *pinfo) { struct ieee80211_sub_if_data *sdata; struct mesh_path *mpath; sdata = IEEE80211_DEV_TO_SUB_IF(dev); rcu_read_lock(); mpath = mpp_path_lookup_by_idx(sdata, idx); if (!mpath) { rcu_read_unlock(); return -ENOENT; } memcpy(dst, mpath->dst, ETH_ALEN); mpp_set_pinfo(mpath, mpp, pinfo); rcu_read_unlock(); return 0; } static int ieee80211_get_mesh_config(struct wiphy *wiphy, struct net_device *dev, struct mesh_config *conf) { struct ieee80211_sub_if_data *sdata; sdata = IEEE80211_DEV_TO_SUB_IF(dev); memcpy(conf, &(sdata->u.mesh.mshcfg), sizeof(struct mesh_config)); return 0; } static inline bool _chg_mesh_attr(enum nl80211_meshconf_params parm, u32 mask) { return (mask >> (parm-1)) & 0x1; } static int copy_mesh_setup(struct ieee80211_if_mesh *ifmsh, const struct mesh_setup *setup) { u8 *new_ie; struct ieee80211_sub_if_data *sdata = container_of(ifmsh, struct ieee80211_sub_if_data, u.mesh); int i; /* allocate information elements */ new_ie = NULL; if (setup->ie_len) { new_ie = kmemdup(setup->ie, setup->ie_len, GFP_KERNEL); if (!new_ie) return -ENOMEM; } ifmsh->ie_len = setup->ie_len; ifmsh->ie = new_ie; /* now copy the rest of the setup parameters */ ifmsh->mesh_id_len = setup->mesh_id_len; memcpy(ifmsh->mesh_id, setup->mesh_id, ifmsh->mesh_id_len); ifmsh->mesh_sp_id = setup->sync_method; ifmsh->mesh_pp_id = setup->path_sel_proto; ifmsh->mesh_pm_id = setup->path_metric; ifmsh->user_mpm = setup->user_mpm; ifmsh->mesh_auth_id = setup->auth_id; ifmsh->security = IEEE80211_MESH_SEC_NONE; ifmsh->userspace_handles_dfs = setup->userspace_handles_dfs; if (setup->is_authenticated) ifmsh->security |= IEEE80211_MESH_SEC_AUTHED; if (setup->is_secure) ifmsh->security |= IEEE80211_MESH_SEC_SECURED; /* mcast rate setting in Mesh Node */ memcpy(sdata->vif.bss_conf.mcast_rate, setup->mcast_rate, sizeof(setup->mcast_rate)); sdata->vif.bss_conf.basic_rates = setup->basic_rates; sdata->vif.bss_conf.beacon_int = setup->beacon_interval; sdata->vif.bss_conf.dtim_period = setup->dtim_period; sdata->beacon_rate_set = false; if (wiphy_ext_feature_isset(sdata->local->hw.wiphy, NL80211_EXT_FEATURE_BEACON_RATE_LEGACY)) { for (i = 0; i < NUM_NL80211_BANDS; i++) { sdata->beacon_rateidx_mask[i] = setup->beacon_rate.control[i].legacy; if (sdata->beacon_rateidx_mask[i]) sdata->beacon_rate_set = true; } } return 0; } static int ieee80211_update_mesh_config(struct wiphy *wiphy, struct net_device *dev, u32 mask, const struct mesh_config *nconf) { struct mesh_config *conf; struct ieee80211_sub_if_data *sdata; struct ieee80211_if_mesh *ifmsh; sdata = IEEE80211_DEV_TO_SUB_IF(dev); ifmsh = &sdata->u.mesh; /* Set the config options which we are interested in setting */ conf = &(sdata->u.mesh.mshcfg); if (_chg_mesh_attr(NL80211_MESHCONF_RETRY_TIMEOUT, mask)) conf->dot11MeshRetryTimeout = nconf->dot11MeshRetryTimeout; if (_chg_mesh_attr(NL80211_MESHCONF_CONFIRM_TIMEOUT, mask)) conf->dot11MeshConfirmTimeout = nconf->dot11MeshConfirmTimeout; if (_chg_mesh_attr(NL80211_MESHCONF_HOLDING_TIMEOUT, mask)) conf->dot11MeshHoldingTimeout = nconf->dot11MeshHoldingTimeout; if (_chg_mesh_attr(NL80211_MESHCONF_MAX_PEER_LINKS, mask)) conf->dot11MeshMaxPeerLinks = nconf->dot11MeshMaxPeerLinks; if (_chg_mesh_attr(NL80211_MESHCONF_MAX_RETRIES, mask)) conf->dot11MeshMaxRetries = 
nconf->dot11MeshMaxRetries; if (_chg_mesh_attr(NL80211_MESHCONF_TTL, mask)) conf->dot11MeshTTL = nconf->dot11MeshTTL; if (_chg_mesh_attr(NL80211_MESHCONF_ELEMENT_TTL, mask)) conf->element_ttl = nconf->element_ttl; if (_chg_mesh_attr(NL80211_MESHCONF_AUTO_OPEN_PLINKS, mask)) { if (ifmsh->user_mpm) return -EBUSY; conf->auto_open_plinks = nconf->auto_open_plinks; } if (_chg_mesh_attr(NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR, mask)) conf->dot11MeshNbrOffsetMaxNeighbor = nconf->dot11MeshNbrOffsetMaxNeighbor; if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES, mask)) conf->dot11MeshHWMPmaxPREQretries = nconf->dot11MeshHWMPmaxPREQretries; if (_chg_mesh_attr(NL80211_MESHCONF_PATH_REFRESH_TIME, mask)) conf->path_refresh_time = nconf->path_refresh_time; if (_chg_mesh_attr(NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT, mask)) conf->min_discovery_timeout = nconf->min_discovery_timeout; if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT, mask)) conf->dot11MeshHWMPactivePathTimeout = nconf->dot11MeshHWMPactivePathTimeout; if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL, mask)) conf->dot11MeshHWMPpreqMinInterval = nconf->dot11MeshHWMPpreqMinInterval; if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL, mask)) conf->dot11MeshHWMPperrMinInterval = nconf->dot11MeshHWMPperrMinInterval; if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME, mask)) conf->dot11MeshHWMPnetDiameterTraversalTime = nconf->dot11MeshHWMPnetDiameterTraversalTime; if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_ROOTMODE, mask)) { conf->dot11MeshHWMPRootMode = nconf->dot11MeshHWMPRootMode; ieee80211_mesh_root_setup(ifmsh); } if (_chg_mesh_attr(NL80211_MESHCONF_GATE_ANNOUNCEMENTS, mask)) { /* our current gate announcement implementation rides on root * announcements, so require this ifmsh to also be a root node * */ if (nconf->dot11MeshGateAnnouncementProtocol && !(conf->dot11MeshHWMPRootMode > IEEE80211_ROOTMODE_ROOT)) { conf->dot11MeshHWMPRootMode = IEEE80211_PROACTIVE_RANN; ieee80211_mesh_root_setup(ifmsh); } conf->dot11MeshGateAnnouncementProtocol = nconf->dot11MeshGateAnnouncementProtocol; } if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_RANN_INTERVAL, mask)) conf->dot11MeshHWMPRannInterval = nconf->dot11MeshHWMPRannInterval; if (_chg_mesh_attr(NL80211_MESHCONF_FORWARDING, mask)) conf->dot11MeshForwarding = nconf->dot11MeshForwarding; if (_chg_mesh_attr(NL80211_MESHCONF_RSSI_THRESHOLD, mask)) { /* our RSSI threshold implementation is supported only for * devices that report signal in dBm. 
*/ if (!ieee80211_hw_check(&sdata->local->hw, SIGNAL_DBM)) return -EOPNOTSUPP; conf->rssi_threshold = nconf->rssi_threshold; } if (_chg_mesh_attr(NL80211_MESHCONF_HT_OPMODE, mask)) { conf->ht_opmode = nconf->ht_opmode; sdata->vif.bss_conf.ht_operation_mode = nconf->ht_opmode; ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_HT); } if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT, mask)) conf->dot11MeshHWMPactivePathToRootTimeout = nconf->dot11MeshHWMPactivePathToRootTimeout; if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_ROOT_INTERVAL, mask)) conf->dot11MeshHWMProotInterval = nconf->dot11MeshHWMProotInterval; if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL, mask)) conf->dot11MeshHWMPconfirmationInterval = nconf->dot11MeshHWMPconfirmationInterval; if (_chg_mesh_attr(NL80211_MESHCONF_POWER_MODE, mask)) { conf->power_mode = nconf->power_mode; ieee80211_mps_local_status_update(sdata); } if (_chg_mesh_attr(NL80211_MESHCONF_AWAKE_WINDOW, mask)) conf->dot11MeshAwakeWindowDuration = nconf->dot11MeshAwakeWindowDuration; if (_chg_mesh_attr(NL80211_MESHCONF_PLINK_TIMEOUT, mask)) conf->plink_timeout = nconf->plink_timeout; if (_chg_mesh_attr(NL80211_MESHCONF_CONNECTED_TO_GATE, mask)) conf->dot11MeshConnectedToMeshGate = nconf->dot11MeshConnectedToMeshGate; if (_chg_mesh_attr(NL80211_MESHCONF_NOLEARN, mask)) conf->dot11MeshNolearn = nconf->dot11MeshNolearn; if (_chg_mesh_attr(NL80211_MESHCONF_CONNECTED_TO_AS, mask)) conf->dot11MeshConnectedToAuthServer = nconf->dot11MeshConnectedToAuthServer; ieee80211_mbss_info_change_notify(sdata, BSS_CHANGED_BEACON); return 0; } static int ieee80211_join_mesh(struct wiphy *wiphy, struct net_device *dev, const struct mesh_config *conf, const struct mesh_setup *setup) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_chan_req chanreq = { .oper = setup->chandef }; struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; int err; lockdep_assert_wiphy(sdata->local->hw.wiphy); memcpy(&ifmsh->mshcfg, conf, sizeof(struct mesh_config)); err = copy_mesh_setup(ifmsh, setup); if (err) return err; sdata->control_port_over_nl80211 = setup->control_port_over_nl80211; /* can mesh use other SMPS modes? 
*/ sdata->deflink.smps_mode = IEEE80211_SMPS_OFF; sdata->deflink.needed_rx_chains = sdata->local->rx_chains; err = ieee80211_link_use_channel(&sdata->deflink, &chanreq, IEEE80211_CHANCTX_SHARED); if (err) return err; return ieee80211_start_mesh(sdata); } static int ieee80211_leave_mesh(struct wiphy *wiphy, struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); lockdep_assert_wiphy(sdata->local->hw.wiphy); ieee80211_stop_mesh(sdata); ieee80211_link_release_channel(&sdata->deflink); kfree(sdata->u.mesh.ie); return 0; } #endif static int ieee80211_change_bss(struct wiphy *wiphy, struct net_device *dev, struct bss_parameters *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link; struct ieee80211_supported_band *sband; u64 changed = 0; link = ieee80211_link_or_deflink(sdata, params->link_id, true); if (IS_ERR(link)) return PTR_ERR(link); if (!sdata_dereference(link->u.ap.beacon, sdata)) return -ENOENT; sband = ieee80211_get_link_sband(link); if (!sband) return -EINVAL; if (params->basic_rates) { if (!ieee80211_parse_bitrates(link->conf->chanreq.oper.width, wiphy->bands[sband->band], params->basic_rates, params->basic_rates_len, &link->conf->basic_rates)) return -EINVAL; changed |= BSS_CHANGED_BASIC_RATES; ieee80211_check_rate_mask(link); } if (params->use_cts_prot >= 0) { link->conf->use_cts_prot = params->use_cts_prot; changed |= BSS_CHANGED_ERP_CTS_PROT; } if (params->use_short_preamble >= 0) { link->conf->use_short_preamble = params->use_short_preamble; changed |= BSS_CHANGED_ERP_PREAMBLE; } if (!link->conf->use_short_slot && (sband->band == NL80211_BAND_5GHZ || sband->band == NL80211_BAND_6GHZ)) { link->conf->use_short_slot = true; changed |= BSS_CHANGED_ERP_SLOT; } if (params->use_short_slot_time >= 0) { link->conf->use_short_slot = params->use_short_slot_time; changed |= BSS_CHANGED_ERP_SLOT; } if (params->ap_isolate >= 0) { if (params->ap_isolate) sdata->flags |= IEEE80211_SDATA_DONT_BRIDGE_PACKETS; else sdata->flags &= ~IEEE80211_SDATA_DONT_BRIDGE_PACKETS; ieee80211_check_fast_rx_iface(sdata); } if (params->ht_opmode >= 0) { link->conf->ht_operation_mode = (u16)params->ht_opmode; changed |= BSS_CHANGED_HT; } if (params->p2p_ctwindow >= 0) { link->conf->p2p_noa_attr.oppps_ctwindow &= ~IEEE80211_P2P_OPPPS_CTWINDOW_MASK; link->conf->p2p_noa_attr.oppps_ctwindow |= params->p2p_ctwindow & IEEE80211_P2P_OPPPS_CTWINDOW_MASK; changed |= BSS_CHANGED_P2P_PS; } if (params->p2p_opp_ps > 0) { link->conf->p2p_noa_attr.oppps_ctwindow |= IEEE80211_P2P_OPPPS_ENABLE_BIT; changed |= BSS_CHANGED_P2P_PS; } else if (params->p2p_opp_ps == 0) { link->conf->p2p_noa_attr.oppps_ctwindow &= ~IEEE80211_P2P_OPPPS_ENABLE_BIT; changed |= BSS_CHANGED_P2P_PS; } ieee80211_link_info_change_notify(sdata, link, changed); return 0; } static int ieee80211_set_txq_params(struct wiphy *wiphy, struct net_device *dev, struct ieee80211_txq_params *params) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link = ieee80211_link_or_deflink(sdata, params->link_id, true); struct ieee80211_tx_queue_params p; if (!local->ops->conf_tx) return -EOPNOTSUPP; if (local->hw.queues < IEEE80211_NUM_ACS) return -EOPNOTSUPP; if (IS_ERR(link)) return PTR_ERR(link); memset(&p, 0, sizeof(p)); p.aifs = params->aifs; p.cw_max = params->cwmax; p.cw_min = params->cwmin; p.txop = params->txop; /* * Setting tx queue params disables u-apsd because it's only * called 
	 * in master mode.
	 */
	p.uapsd = false;

	ieee80211_regulatory_limit_wmm_params(sdata, &p, params->ac);

	link->tx_conf[params->ac] = p;
	if (drv_conf_tx(local, link, params->ac, &p)) {
		wiphy_debug(local->hw.wiphy,
			    "failed to set TX queue parameters for AC %d\n",
			    params->ac);
		return -EINVAL;
	}

	ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_QOS);

	return 0;
}

#ifdef CONFIG_PM
static int ieee80211_suspend(struct wiphy *wiphy,
			     struct cfg80211_wowlan *wowlan)
{
	return __ieee80211_suspend(wiphy_priv(wiphy), wowlan);
}

static int ieee80211_resume(struct wiphy *wiphy)
{
	return __ieee80211_resume(wiphy_priv(wiphy));
}
#else
#define ieee80211_suspend NULL
#define ieee80211_resume NULL
#endif

static int ieee80211_scan(struct wiphy *wiphy,
			  struct cfg80211_scan_request *req)
{
	struct ieee80211_sub_if_data *sdata;

	sdata = IEEE80211_WDEV_TO_SUB_IF(req->wdev);

	switch (ieee80211_vif_type_p2p(&sdata->vif)) {
	case NL80211_IFTYPE_STATION:
	case NL80211_IFTYPE_ADHOC:
	case NL80211_IFTYPE_MESH_POINT:
	case NL80211_IFTYPE_P2P_CLIENT:
	case NL80211_IFTYPE_P2P_DEVICE:
		break;
	case NL80211_IFTYPE_P2P_GO:
		if (sdata->local->ops->hw_scan)
			break;
		/*
		 * FIXME: implement NoA while scanning in software,
		 * for now fall through to allow scanning only when
		 * beaconing hasn't been configured yet
		 */
		fallthrough;
	case NL80211_IFTYPE_AP:
		/*
		 * If the scan has been forced (and the driver supports
		 * forcing), don't care about being beaconing already.
		 * This will create problems to the attached stations (e.g. all
		 * the frames sent while scanning on other channel will be
		 * lost)
		 */
		if (sdata->deflink.u.ap.beacon &&
		    (!(wiphy->features & NL80211_FEATURE_AP_SCAN) ||
		     !(req->flags & NL80211_SCAN_FLAG_AP)))
			return -EOPNOTSUPP;
		break;
	case NL80211_IFTYPE_NAN:
	default:
		return -EOPNOTSUPP;
	}

	return ieee80211_request_scan(sdata, req);
}

static void ieee80211_abort_scan(struct wiphy *wiphy, struct wireless_dev *wdev)
{
	ieee80211_scan_cancel(wiphy_priv(wiphy));
}

static int ieee80211_sched_scan_start(struct wiphy *wiphy,
				      struct net_device *dev,
				      struct cfg80211_sched_scan_request *req)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);

	if (!sdata->local->ops->sched_scan_start)
		return -EOPNOTSUPP;

	return ieee80211_request_sched_scan_start(sdata, req);
}

static int ieee80211_sched_scan_stop(struct wiphy *wiphy,
				     struct net_device *dev, u64 reqid)
{
	struct ieee80211_local *local = wiphy_priv(wiphy);

	if (!local->ops->sched_scan_stop)
		return -EOPNOTSUPP;

	return ieee80211_request_sched_scan_stop(local);
}

static int ieee80211_auth(struct wiphy *wiphy, struct net_device *dev,
			  struct cfg80211_auth_request *req)
{
	return ieee80211_mgd_auth(IEEE80211_DEV_TO_SUB_IF(dev), req);
}

static int ieee80211_assoc(struct wiphy *wiphy, struct net_device *dev,
			   struct cfg80211_assoc_request *req)
{
	return ieee80211_mgd_assoc(IEEE80211_DEV_TO_SUB_IF(dev), req);
}

static int ieee80211_deauth(struct wiphy *wiphy, struct net_device *dev,
			    struct cfg80211_deauth_request *req)
{
	return ieee80211_mgd_deauth(IEEE80211_DEV_TO_SUB_IF(dev), req);
}

static int ieee80211_disassoc(struct wiphy *wiphy, struct net_device *dev,
			      struct cfg80211_disassoc_request *req)
{
	return ieee80211_mgd_disassoc(IEEE80211_DEV_TO_SUB_IF(dev), req);
}

static int ieee80211_join_ibss(struct wiphy *wiphy, struct net_device *dev,
			       struct cfg80211_ibss_params *params)
{
	return ieee80211_ibss_join(IEEE80211_DEV_TO_SUB_IF(dev), params);
}

static int ieee80211_leave_ibss(struct wiphy *wiphy, struct net_device *dev)
{
	return ieee80211_ibss_leave(IEEE80211_DEV_TO_SUB_IF(dev));
}

static int
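/*
 * OCB ("outside the context of a BSS") mode: these two cfg80211 ops are
 * thin wrappers around the mac80211 OCB join/leave implementation.
 */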
ieee80211_join_ocb(struct wiphy *wiphy, struct net_device *dev, struct ocb_setup *setup) { return ieee80211_ocb_join(IEEE80211_DEV_TO_SUB_IF(dev), setup); } static int ieee80211_leave_ocb(struct wiphy *wiphy, struct net_device *dev) { return ieee80211_ocb_leave(IEEE80211_DEV_TO_SUB_IF(dev)); } static int ieee80211_set_mcast_rate(struct wiphy *wiphy, struct net_device *dev, int rate[NUM_NL80211_BANDS]) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); memcpy(sdata->vif.bss_conf.mcast_rate, rate, sizeof(int) * NUM_NL80211_BANDS); if (ieee80211_sdata_running(sdata)) ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_MCAST_RATE); return 0; } static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) { struct ieee80211_local *local = wiphy_priv(wiphy); int err; if (changed & WIPHY_PARAM_FRAG_THRESHOLD) { ieee80211_check_fast_xmit_all(local); err = drv_set_frag_threshold(local, wiphy->frag_threshold); if (err) { ieee80211_check_fast_xmit_all(local); return err; } } if ((changed & WIPHY_PARAM_COVERAGE_CLASS) || (changed & WIPHY_PARAM_DYN_ACK)) { s16 coverage_class; coverage_class = changed & WIPHY_PARAM_COVERAGE_CLASS ? wiphy->coverage_class : -1; err = drv_set_coverage_class(local, coverage_class); if (err) return err; } if (changed & WIPHY_PARAM_RTS_THRESHOLD) { err = drv_set_rts_threshold(local, wiphy->rts_threshold); if (err) return err; } if (changed & WIPHY_PARAM_RETRY_SHORT) { if (wiphy->retry_short > IEEE80211_MAX_TX_RETRY) return -EINVAL; local->hw.conf.short_frame_max_tx_count = wiphy->retry_short; } if (changed & WIPHY_PARAM_RETRY_LONG) { if (wiphy->retry_long > IEEE80211_MAX_TX_RETRY) return -EINVAL; local->hw.conf.long_frame_max_tx_count = wiphy->retry_long; } if (changed & (WIPHY_PARAM_RETRY_SHORT | WIPHY_PARAM_RETRY_LONG)) ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_RETRY_LIMITS); if (changed & (WIPHY_PARAM_TXQ_LIMIT | WIPHY_PARAM_TXQ_MEMORY_LIMIT | WIPHY_PARAM_TXQ_QUANTUM)) ieee80211_txq_set_params(local); return 0; } static int ieee80211_set_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, enum nl80211_tx_power_setting type, int mbm) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata; enum nl80211_tx_power_setting txp_type = type; bool update_txp_type = false; bool has_monitor = false; int user_power_level; int old_power = local->user_power_level; lockdep_assert_wiphy(local->hw.wiphy); switch (type) { case NL80211_TX_POWER_AUTOMATIC: user_power_level = IEEE80211_UNSET_POWER_LEVEL; txp_type = NL80211_TX_POWER_LIMITED; break; case NL80211_TX_POWER_LIMITED: case NL80211_TX_POWER_FIXED: if (mbm < 0 || (mbm % 100)) return -EOPNOTSUPP; user_power_level = MBM_TO_DBM(mbm); break; default: return -EINVAL; } if (wdev) { sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); if (sdata->vif.type == NL80211_IFTYPE_MONITOR && !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) return -EOPNOTSUPP; sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); if (!sdata) return -EOPNOTSUPP; } for (int link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link = wiphy_dereference(wiphy, sdata->link[link_id]); if (!link) continue; link->user_power_level = user_power_level; if (txp_type != link->conf->txpower_type) { update_txp_type = true; link->conf->txpower_type = txp_type; } ieee80211_recalc_txpower(link, update_txp_type); } return 0; } local->user_power_level = user_power_level; list_for_each_entry(sdata, 
&local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_MONITOR && !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { has_monitor = true; continue; } for (int link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link = wiphy_dereference(wiphy, sdata->link[link_id]); if (!link) continue; link->user_power_level = local->user_power_level; if (txp_type != link->conf->txpower_type) update_txp_type = true; link->conf->txpower_type = txp_type; } } list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_MONITOR && !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) continue; for (int link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link = wiphy_dereference(wiphy, sdata->link[link_id]); if (!link) continue; ieee80211_recalc_txpower(link, update_txp_type); } } if (has_monitor) { sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); if (sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) { sdata->deflink.user_power_level = local->user_power_level; if (txp_type != sdata->vif.bss_conf.txpower_type) update_txp_type = true; sdata->vif.bss_conf.txpower_type = txp_type; ieee80211_recalc_txpower(&sdata->deflink, update_txp_type); } } if (local->emulate_chanctx && (old_power != local->user_power_level)) ieee80211_hw_conf_chan(local); return 0; } static int ieee80211_get_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, int *dbm) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); if (local->ops->get_txpower && (sdata->flags & IEEE80211_SDATA_IN_DRIVER)) return drv_get_txpower(local, sdata, dbm); if (local->emulate_chanctx) *dbm = local->hw.conf.power_level; else *dbm = sdata->vif.bss_conf.txpower; /* INT_MIN indicates no power level was set yet */ if (*dbm == INT_MIN) return -EINVAL; return 0; } static void ieee80211_rfkill_poll(struct wiphy *wiphy) { struct ieee80211_local *local = wiphy_priv(wiphy); drv_rfkill_poll(local); } #ifdef CONFIG_NL80211_TESTMODE static int ieee80211_testmode_cmd(struct wiphy *wiphy, struct wireless_dev *wdev, void *data, int len) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_vif *vif = NULL; if (!local->ops->testmode_cmd) return -EOPNOTSUPP; if (wdev) { struct ieee80211_sub_if_data *sdata; sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); if (sdata->flags & IEEE80211_SDATA_IN_DRIVER) vif = &sdata->vif; } return local->ops->testmode_cmd(&local->hw, vif, data, len); } static int ieee80211_testmode_dump(struct wiphy *wiphy, struct sk_buff *skb, struct netlink_callback *cb, void *data, int len) { struct ieee80211_local *local = wiphy_priv(wiphy); if (!local->ops->testmode_dump) return -EOPNOTSUPP; return local->ops->testmode_dump(&local->hw, skb, cb, data, len); } #endif int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, enum ieee80211_smps_mode smps_mode) { const u8 *ap; enum ieee80211_smps_mode old_req; int err; struct sta_info *sta; bool tdls_peer_found = false; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION)) return -EINVAL; if (!ieee80211_vif_link_active(&sdata->vif, link->link_id)) return 0; old_req = link->u.mgd.req_smps; link->u.mgd.req_smps = smps_mode; /* The driver indicated that EML is enabled for the interface, which * implies that SMPS flows towards the AP should be stopped. 
*/ if (sdata->vif.driver_flags & IEEE80211_VIF_EML_ACTIVE) return 0; if (old_req == smps_mode && smps_mode != IEEE80211_SMPS_AUTOMATIC) return 0; /* * If not associated, or current association is not an HT * association, there's no need to do anything, just store * the new value until we associate. */ if (!sdata->u.mgd.associated || link->conf->chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT) return 0; ap = sdata->vif.cfg.ap_addr; rcu_read_lock(); list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) { if (!sta->sta.tdls || sta->sdata != sdata || !sta->uploaded || !test_sta_flag(sta, WLAN_STA_AUTHORIZED)) continue; tdls_peer_found = true; break; } rcu_read_unlock(); if (smps_mode == IEEE80211_SMPS_AUTOMATIC) { if (tdls_peer_found || !sdata->u.mgd.powersave) smps_mode = IEEE80211_SMPS_OFF; else smps_mode = IEEE80211_SMPS_DYNAMIC; } /* send SM PS frame to AP */ err = ieee80211_send_smps_action(sdata, smps_mode, ap, ap, ieee80211_vif_is_mld(&sdata->vif) ? link->link_id : -1); if (err) link->u.mgd.req_smps = old_req; else if (smps_mode != IEEE80211_SMPS_OFF && tdls_peer_found) ieee80211_teardown_tdls_peers(link); return err; } static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev, bool enabled, int timeout) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); unsigned int link_id; if (sdata->vif.type != NL80211_IFTYPE_STATION) return -EOPNOTSUPP; if (!ieee80211_hw_check(&local->hw, SUPPORTS_PS)) return -EOPNOTSUPP; if (enabled == sdata->u.mgd.powersave && timeout == local->dynamic_ps_forced_timeout) return 0; sdata->u.mgd.powersave = enabled; local->dynamic_ps_forced_timeout = timeout; /* no change, but if automatic follow powersave */ for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link; link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; __ieee80211_request_smps_mgd(sdata, link, link->u.mgd.req_smps); } if (ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); ieee80211_recalc_ps(local); ieee80211_recalc_ps_vif(sdata); ieee80211_check_fast_rx_iface(sdata); return 0; } static void ieee80211_set_cqm_rssi_link(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, s32 rssi_thold, u32 rssi_hyst, s32 rssi_low, s32 rssi_high) { struct ieee80211_bss_conf *conf; if (!link || !link->conf) return; conf = link->conf; if (rssi_thold && rssi_hyst && rssi_thold == conf->cqm_rssi_thold && rssi_hyst == conf->cqm_rssi_hyst) return; conf->cqm_rssi_thold = rssi_thold; conf->cqm_rssi_hyst = rssi_hyst; conf->cqm_rssi_low = rssi_low; conf->cqm_rssi_high = rssi_high; link->u.mgd.last_cqm_event_signal = 0; if (!ieee80211_vif_link_active(&sdata->vif, link->link_id)) return; if (sdata->u.mgd.associated && (sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI)) ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_CQM); } static int ieee80211_set_cqm_rssi_config(struct wiphy *wiphy, struct net_device *dev, s32 rssi_thold, u32 rssi_hyst) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_vif *vif = &sdata->vif; int link_id; if (vif->driver_flags & IEEE80211_VIF_BEACON_FILTER && !(vif->driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI)) return -EOPNOTSUPP; /* For MLD, handle CQM change on all the active links */ for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct ieee80211_link_data *link = 
sdata_dereference(sdata->link[link_id], sdata); ieee80211_set_cqm_rssi_link(sdata, link, rssi_thold, rssi_hyst, 0, 0); } return 0; } static int ieee80211_set_cqm_rssi_range_config(struct wiphy *wiphy, struct net_device *dev, s32 rssi_low, s32 rssi_high) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_vif *vif = &sdata->vif; int link_id; if (vif->driver_flags & IEEE80211_VIF_BEACON_FILTER) return -EOPNOTSUPP; /* For MLD, handle CQM change on all the active links */ for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct ieee80211_link_data *link = sdata_dereference(sdata->link[link_id], sdata); ieee80211_set_cqm_rssi_link(sdata, link, 0, 0, rssi_low, rssi_high); } return 0; } static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, struct net_device *dev, unsigned int link_id, const u8 *addr, const struct cfg80211_bitrate_mask *mask) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); int i, ret; if (!ieee80211_sdata_running(sdata)) return -ENETDOWN; /* * If active validate the setting and reject it if it doesn't leave * at least one basic rate usable, since we really have to be able * to send something, and if we're an AP we have to be able to do * so at a basic rate so that all clients can receive it. */ if (rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) && sdata->vif.bss_conf.chanreq.oper.chan) { u32 basic_rates = sdata->vif.bss_conf.basic_rates; enum nl80211_band band; band = sdata->vif.bss_conf.chanreq.oper.chan->band; if (!(mask->control[band].legacy & basic_rates)) return -EINVAL; } if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) { ret = drv_set_bitrate_mask(local, sdata, mask); if (ret) return ret; } for (i = 0; i < NUM_NL80211_BANDS; i++) { struct ieee80211_supported_band *sband = wiphy->bands[i]; int j; sdata->rc_rateidx_mask[i] = mask->control[i].legacy; memcpy(sdata->rc_rateidx_mcs_mask[i], mask->control[i].ht_mcs, sizeof(mask->control[i].ht_mcs)); memcpy(sdata->rc_rateidx_vht_mcs_mask[i], mask->control[i].vht_mcs, sizeof(mask->control[i].vht_mcs)); sdata->rc_has_mcs_mask[i] = false; sdata->rc_has_vht_mcs_mask[i] = false; if (!sband) continue; for (j = 0; j < IEEE80211_HT_MCS_MASK_LEN; j++) { if (sdata->rc_rateidx_mcs_mask[i][j] != 0xff) { sdata->rc_has_mcs_mask[i] = true; break; } } for (j = 0; j < NL80211_VHT_NSS_MAX; j++) { if (sdata->rc_rateidx_vht_mcs_mask[i][j] != 0xffff) { sdata->rc_has_vht_mcs_mask[i] = true; break; } } } return 0; } static int ieee80211_start_radar_detection(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_chan_def *chandef, u32 cac_time_ms, int link_id) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_chan_req chanreq = { .oper = *chandef }; struct ieee80211_local *local = sdata->local; struct ieee80211_link_data *link_data; int err; lockdep_assert_wiphy(local->hw.wiphy); if (!list_empty(&local->roc_list) || local->scanning) return -EBUSY; link_data = sdata_dereference(sdata->link[link_id], sdata); if (!link_data) return -ENOLINK; /* whatever, but channel contexts should not complain about that one */ link_data->smps_mode = IEEE80211_SMPS_OFF; link_data->needed_rx_chains = local->rx_chains; err = ieee80211_link_use_channel(link_data, &chanreq, IEEE80211_CHANCTX_SHARED); if (err) return err; wiphy_delayed_work_queue(wiphy, &link_data->dfs_cac_timer_work, msecs_to_jiffies(cac_time_ms)); return 0; } static void ieee80211_end_cac(struct wiphy *wiphy, 
struct net_device *dev, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct ieee80211_link_data *link_data; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry(sdata, &local->interfaces, list) { link_data = sdata_dereference(sdata->link[link_id], sdata); if (!link_data) continue; wiphy_delayed_work_cancel(wiphy, &link_data->dfs_cac_timer_work); if (sdata->wdev.links[link_id].cac_started) { ieee80211_link_release_channel(link_data); sdata->wdev.links[link_id].cac_started = false; } } } static struct cfg80211_beacon_data * cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon) { struct cfg80211_beacon_data *new_beacon; u8 *pos; int len; len = beacon->head_len + beacon->tail_len + beacon->beacon_ies_len + beacon->proberesp_ies_len + beacon->assocresp_ies_len + beacon->probe_resp_len + beacon->lci_len + beacon->civicloc_len; if (beacon->mbssid_ies) len += ieee80211_get_mbssid_beacon_len(beacon->mbssid_ies, beacon->rnr_ies, beacon->mbssid_ies->cnt); new_beacon = kzalloc(sizeof(*new_beacon) + len, GFP_KERNEL); if (!new_beacon) return NULL; if (beacon->mbssid_ies && beacon->mbssid_ies->cnt) { new_beacon->mbssid_ies = kzalloc(struct_size(new_beacon->mbssid_ies, elem, beacon->mbssid_ies->cnt), GFP_KERNEL); if (!new_beacon->mbssid_ies) { kfree(new_beacon); return NULL; } if (beacon->rnr_ies && beacon->rnr_ies->cnt) { new_beacon->rnr_ies = kzalloc(struct_size(new_beacon->rnr_ies, elem, beacon->rnr_ies->cnt), GFP_KERNEL); if (!new_beacon->rnr_ies) { kfree(new_beacon->mbssid_ies); kfree(new_beacon); return NULL; } } } pos = (u8 *)(new_beacon + 1); if (beacon->head_len) { new_beacon->head_len = beacon->head_len; new_beacon->head = pos; memcpy(pos, beacon->head, beacon->head_len); pos += beacon->head_len; } if (beacon->tail_len) { new_beacon->tail_len = beacon->tail_len; new_beacon->tail = pos; memcpy(pos, beacon->tail, beacon->tail_len); pos += beacon->tail_len; } if (beacon->beacon_ies_len) { new_beacon->beacon_ies_len = beacon->beacon_ies_len; new_beacon->beacon_ies = pos; memcpy(pos, beacon->beacon_ies, beacon->beacon_ies_len); pos += beacon->beacon_ies_len; } if (beacon->proberesp_ies_len) { new_beacon->proberesp_ies_len = beacon->proberesp_ies_len; new_beacon->proberesp_ies = pos; memcpy(pos, beacon->proberesp_ies, beacon->proberesp_ies_len); pos += beacon->proberesp_ies_len; } if (beacon->assocresp_ies_len) { new_beacon->assocresp_ies_len = beacon->assocresp_ies_len; new_beacon->assocresp_ies = pos; memcpy(pos, beacon->assocresp_ies, beacon->assocresp_ies_len); pos += beacon->assocresp_ies_len; } if (beacon->probe_resp_len) { new_beacon->probe_resp_len = beacon->probe_resp_len; new_beacon->probe_resp = pos; memcpy(pos, beacon->probe_resp, beacon->probe_resp_len); pos += beacon->probe_resp_len; } if (beacon->mbssid_ies && beacon->mbssid_ies->cnt) { pos += ieee80211_copy_mbssid_beacon(pos, new_beacon->mbssid_ies, beacon->mbssid_ies); if (beacon->rnr_ies && beacon->rnr_ies->cnt) pos += ieee80211_copy_rnr_beacon(pos, new_beacon->rnr_ies, beacon->rnr_ies); } /* might copy -1, meaning no changes requested */ new_beacon->ftm_responder = beacon->ftm_responder; if (beacon->lci) { new_beacon->lci_len = beacon->lci_len; new_beacon->lci = pos; memcpy(pos, beacon->lci, beacon->lci_len); pos += beacon->lci_len; } if (beacon->civicloc) { new_beacon->civicloc_len = beacon->civicloc_len; new_beacon->civicloc = pos; memcpy(pos, beacon->civicloc, beacon->civicloc_len); pos += beacon->civicloc_len; } return 
new_beacon; } void ieee80211_csa_finish(struct ieee80211_vif *vif, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_local *local = sdata->local; struct ieee80211_link_data *link_data; if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) return; rcu_read_lock(); link_data = rcu_dereference(sdata->link[link_id]); if (WARN_ON(!link_data)) { rcu_read_unlock(); return; } /* TODO: MBSSID with MLO changes */ if (vif->mbssid_tx_vif == vif) { /* Trigger ieee80211_csa_finish() on the non-transmitting * interfaces when channel switch is received on * transmitting interface */ struct ieee80211_sub_if_data *iter; list_for_each_entry_rcu(iter, &local->interfaces, list) { if (!ieee80211_sdata_running(iter)) continue; if (iter == sdata || iter->vif.mbssid_tx_vif != vif) continue; wiphy_work_queue(iter->local->hw.wiphy, &iter->deflink.csa.finalize_work); } } wiphy_work_queue(local->hw.wiphy, &link_data->csa.finalize_work); rcu_read_unlock(); } EXPORT_SYMBOL(ieee80211_csa_finish); void ieee80211_channel_switch_disconnect(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; sdata_info(sdata, "channel switch failed, disconnecting\n"); wiphy_work_queue(local->hw.wiphy, &ifmgd->csa_connection_drop_work); } EXPORT_SYMBOL(ieee80211_channel_switch_disconnect); static int ieee80211_set_after_csa_beacon(struct ieee80211_link_data *link_data, u64 *changed) { struct ieee80211_sub_if_data *sdata = link_data->sdata; int err; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: if (!link_data->u.ap.next_beacon) return -EINVAL; err = ieee80211_assign_beacon(sdata, link_data, link_data->u.ap.next_beacon, NULL, NULL, changed); ieee80211_free_next_beacon(link_data); if (err < 0) return err; break; case NL80211_IFTYPE_ADHOC: err = ieee80211_ibss_finish_csa(sdata, changed); if (err < 0) return err; break; #ifdef CONFIG_MAC80211_MESH case NL80211_IFTYPE_MESH_POINT: err = ieee80211_mesh_finish_csa(sdata, changed); if (err < 0) return err; break; #endif default: WARN_ON(1); return -EINVAL; } return 0; } static int __ieee80211_csa_finalize(struct ieee80211_link_data *link_data) { struct ieee80211_sub_if_data *sdata = link_data->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_bss_conf *link_conf = link_data->conf; u64 changed = 0; int err; lockdep_assert_wiphy(local->hw.wiphy); /* * using reservation isn't immediate as it may be deferred until later * with multi-vif. 
once reservation is complete it will re-schedule the * work with no reserved_chanctx so verify chandef to check if it * completed successfully */ if (link_data->reserved_chanctx) { /* * with multi-vif csa driver may call ieee80211_csa_finish() * many times while waiting for other interfaces to use their * reservations */ if (link_data->reserved_ready) return 0; return ieee80211_link_use_reserved_context(link_data); } if (!cfg80211_chandef_identical(&link_conf->chanreq.oper, &link_data->csa.chanreq.oper)) return -EINVAL; link_conf->csa_active = false; err = ieee80211_set_after_csa_beacon(link_data, &changed); if (err) return err; ieee80211_link_info_change_notify(sdata, link_data, changed); ieee80211_vif_unblock_queues_csa(sdata); err = drv_post_channel_switch(link_data); if (err) return err; cfg80211_ch_switch_notify(sdata->dev, &link_data->csa.chanreq.oper, link_data->link_id); return 0; } static void ieee80211_csa_finalize(struct ieee80211_link_data *link_data) { struct ieee80211_sub_if_data *sdata = link_data->sdata; if (__ieee80211_csa_finalize(link_data)) { sdata_info(sdata, "failed to finalize CSA on link %d, disconnecting\n", link_data->link_id); cfg80211_stop_iface(sdata->local->hw.wiphy, &sdata->wdev, GFP_KERNEL); } } void ieee80211_csa_finalize_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_link_data *link = container_of(work, struct ieee80211_link_data, csa.finalize_work); struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); /* AP might have been stopped while waiting for the lock. */ if (!link->conf->csa_active) return; if (!ieee80211_sdata_running(sdata)) return; ieee80211_csa_finalize(link); } static int ieee80211_set_csa_beacon(struct ieee80211_link_data *link_data, struct cfg80211_csa_settings *params, u64 *changed) { struct ieee80211_sub_if_data *sdata = link_data->sdata; struct ieee80211_csa_settings csa = {}; int err; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: link_data->u.ap.next_beacon = cfg80211_beacon_dup(&params->beacon_after); if (!link_data->u.ap.next_beacon) return -ENOMEM; /* * With a count of 0, we don't have to wait for any * TBTT before switching, so complete the CSA * immediately. In theory, with a count == 1 we * should delay the switch until just before the next * TBTT, but that would complicate things so we switch * immediately too. If we would delay the switch * until the next TBTT, we would have to set the probe * response here. * * TODO: A channel switch with count <= 1 without * sending a CSA action frame is kind of useless, * because the clients won't know we're changing * channels. The action frame must be implemented * either here or in the userspace. 
*/ if (params->count <= 1) break; if ((params->n_counter_offsets_beacon > IEEE80211_MAX_CNTDWN_COUNTERS_NUM) || (params->n_counter_offsets_presp > IEEE80211_MAX_CNTDWN_COUNTERS_NUM)) { ieee80211_free_next_beacon(link_data); return -EINVAL; } csa.counter_offsets_beacon = params->counter_offsets_beacon; csa.counter_offsets_presp = params->counter_offsets_presp; csa.n_counter_offsets_beacon = params->n_counter_offsets_beacon; csa.n_counter_offsets_presp = params->n_counter_offsets_presp; csa.count = params->count; err = ieee80211_assign_beacon(sdata, link_data, &params->beacon_csa, &csa, NULL, changed); if (err < 0) { ieee80211_free_next_beacon(link_data); return err; } break; case NL80211_IFTYPE_ADHOC: if (!sdata->vif.cfg.ibss_joined) return -EINVAL; if (params->chandef.width != sdata->u.ibss.chandef.width) return -EINVAL; switch (params->chandef.width) { case NL80211_CHAN_WIDTH_40: if (cfg80211_get_chandef_type(&params->chandef) != cfg80211_get_chandef_type(&sdata->u.ibss.chandef)) return -EINVAL; break; case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: break; default: return -EINVAL; } /* changes into another band are not supported */ if (sdata->u.ibss.chandef.chan->band != params->chandef.chan->band) return -EINVAL; /* see comments in the NL80211_IFTYPE_AP block */ if (params->count > 1) { err = ieee80211_ibss_csa_beacon(sdata, params, changed); if (err < 0) return err; } ieee80211_send_action_csa(sdata, params); break; #ifdef CONFIG_MAC80211_MESH case NL80211_IFTYPE_MESH_POINT: { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; /* changes into another band are not supported */ if (sdata->vif.bss_conf.chanreq.oper.chan->band != params->chandef.chan->band) return -EINVAL; if (ifmsh->csa_role == IEEE80211_MESH_CSA_ROLE_NONE) { ifmsh->csa_role = IEEE80211_MESH_CSA_ROLE_INIT; if (!ifmsh->pre_value) ifmsh->pre_value = 1; else ifmsh->pre_value++; } /* see comments in the NL80211_IFTYPE_AP block */ if (params->count > 1) { err = ieee80211_mesh_csa_beacon(sdata, params, changed); if (err < 0) { ifmsh->csa_role = IEEE80211_MESH_CSA_ROLE_NONE; return err; } } if (ifmsh->csa_role == IEEE80211_MESH_CSA_ROLE_INIT) ieee80211_send_action_csa(sdata, params); break; } #endif default: return -EOPNOTSUPP; } return 0; } static void ieee80211_color_change_abort(struct ieee80211_link_data *link) { link->conf->color_change_active = false; ieee80211_free_next_beacon(link); cfg80211_color_change_aborted_notify(link->sdata->dev, link->link_id); } static int __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_csa_settings *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_chan_req chanreq = { .oper = params->chandef }; struct ieee80211_local *local = sdata->local; struct ieee80211_channel_switch ch_switch = { .link_id = params->link_id, }; struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *chanctx; struct ieee80211_bss_conf *link_conf; struct ieee80211_link_data *link_data; u64 changed = 0; u8 link_id = params->link_id; int err; lockdep_assert_wiphy(local->hw.wiphy); if (!list_empty(&local->roc_list) || local->scanning) return -EBUSY; if (sdata->wdev.links[link_id].cac_started) return -EBUSY; if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) return -EINVAL; link_data = wiphy_dereference(wiphy, sdata->link[link_id]); if (!link_data) return -ENOLINK; link_conf = link_data->conf; if (chanreq.oper.punctured && !link_conf->eht_support) return -EINVAL; /* don't 
allow another channel switch if one is already active. */ if (link_conf->csa_active) return -EBUSY; conf = wiphy_dereference(wiphy, link_conf->chanctx_conf); if (!conf) { err = -EBUSY; goto out; } if (params->chandef.chan->freq_offset) { /* this may work, but is untested */ err = -EOPNOTSUPP; goto out; } chanctx = container_of(conf, struct ieee80211_chanctx, conf); ch_switch.timestamp = 0; ch_switch.device_timestamp = 0; ch_switch.block_tx = params->block_tx; ch_switch.chandef = chanreq.oper; ch_switch.count = params->count; err = drv_pre_channel_switch(sdata, &ch_switch); if (err) goto out; err = ieee80211_link_reserve_chanctx(link_data, &chanreq, chanctx->mode, params->radar_required); if (err) goto out; /* if reservation is invalid then this will fail */ err = ieee80211_check_combinations(sdata, NULL, chanctx->mode, 0, -1); if (err) { ieee80211_link_unreserve_chanctx(link_data); goto out; } /* if there is a color change in progress, abort it */ if (link_conf->color_change_active) ieee80211_color_change_abort(link_data); err = ieee80211_set_csa_beacon(link_data, params, &changed); if (err) { ieee80211_link_unreserve_chanctx(link_data); goto out; } link_data->csa.chanreq = chanreq; link_conf->csa_active = true; if (params->block_tx) ieee80211_vif_block_queues_csa(sdata); cfg80211_ch_switch_started_notify(sdata->dev, &link_data->csa.chanreq.oper, link_id, params->count, params->block_tx); if (changed) { ieee80211_link_info_change_notify(sdata, link_data, changed); drv_channel_switch_beacon(sdata, &link_data->csa.chanreq.oper); } else { /* if the beacon didn't change, we can finalize immediately */ ieee80211_csa_finalize(link_data); } out: return err; } int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_csa_settings *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); return __ieee80211_channel_switch(wiphy, dev, params); } u64 ieee80211_mgmt_tx_cookie(struct ieee80211_local *local) { lockdep_assert_wiphy(local->hw.wiphy); local->roc_cookie_counter++; /* wow, you wrapped 64 bits ... 
more likely a bug */ if (WARN_ON(local->roc_cookie_counter == 0)) local->roc_cookie_counter++; return local->roc_cookie_counter; } int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, u64 *cookie, gfp_t gfp) { unsigned long spin_flags; struct sk_buff *ack_skb; int id; ack_skb = skb_copy(skb, gfp); if (!ack_skb) return -ENOMEM; spin_lock_irqsave(&local->ack_status_lock, spin_flags); id = idr_alloc(&local->ack_status_frames, ack_skb, 1, 0x2000, GFP_ATOMIC); spin_unlock_irqrestore(&local->ack_status_lock, spin_flags); if (id < 0) { kfree_skb(ack_skb); return -ENOMEM; } IEEE80211_SKB_CB(skb)->status_data_idr = 1; IEEE80211_SKB_CB(skb)->status_data = id; *cookie = ieee80211_mgmt_tx_cookie(local); IEEE80211_SKB_CB(ack_skb)->ack.cookie = *cookie; return 0; } static void ieee80211_update_mgmt_frame_registrations(struct wiphy *wiphy, struct wireless_dev *wdev, struct mgmt_frame_regs *upd) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); u32 preq_mask = BIT(IEEE80211_STYPE_PROBE_REQ >> 4); u32 action_mask = BIT(IEEE80211_STYPE_ACTION >> 4); bool global_change, intf_change; global_change = (local->probe_req_reg != !!(upd->global_stypes & preq_mask)) || (local->rx_mcast_action_reg != !!(upd->global_mcast_stypes & action_mask)); local->probe_req_reg = upd->global_stypes & preq_mask; local->rx_mcast_action_reg = upd->global_mcast_stypes & action_mask; intf_change = (sdata->vif.probe_req_reg != !!(upd->interface_stypes & preq_mask)) || (sdata->vif.rx_mcast_action_reg != !!(upd->interface_mcast_stypes & action_mask)); sdata->vif.probe_req_reg = upd->interface_stypes & preq_mask; sdata->vif.rx_mcast_action_reg = upd->interface_mcast_stypes & action_mask; if (!local->open_count) return; if (intf_change && ieee80211_sdata_running(sdata)) drv_config_iface_filter(local, sdata, sdata->vif.probe_req_reg ? 
FIF_PROBE_REQ : 0, FIF_PROBE_REQ); if (global_change) ieee80211_configure_filter(local); } static int ieee80211_set_antenna(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant) { struct ieee80211_local *local = wiphy_priv(wiphy); int ret; if (local->started) return -EOPNOTSUPP; ret = drv_set_antenna(local, tx_ant, rx_ant); if (ret) return ret; local->rx_chains = hweight8(rx_ant); return 0; } static int ieee80211_get_antenna(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant) { struct ieee80211_local *local = wiphy_priv(wiphy); return drv_get_antenna(local, tx_ant, rx_ant); } static int ieee80211_set_rekey_data(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_gtk_rekey_data *data) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); if (!local->ops->set_rekey_data) return -EOPNOTSUPP; drv_set_rekey_data(local, sdata, data); return 0; } static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, const u8 *peer, u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct ieee80211_qos_hdr *nullfunc; struct sk_buff *skb; int size = sizeof(*nullfunc); __le16 fc; bool qos; struct ieee80211_tx_info *info; struct sta_info *sta; struct ieee80211_chanctx_conf *chanctx_conf; enum nl80211_band band; int ret; /* the lock is needed to assign the cookie later */ lockdep_assert_wiphy(local->hw.wiphy); rcu_read_lock(); sta = sta_info_get_bss(sdata, peer); if (!sta) { ret = -ENOLINK; goto unlock; } qos = sta->sta.wme; chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (WARN_ON(!chanctx_conf)) { ret = -EINVAL; goto unlock; } band = chanctx_conf->def.chan->band; if (qos) { fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_QOS_NULLFUNC | IEEE80211_FCTL_FROMDS); } else { size -= 2; fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC | IEEE80211_FCTL_FROMDS); } skb = dev_alloc_skb(local->hw.extra_tx_headroom + size); if (!skb) { ret = -ENOMEM; goto unlock; } skb->dev = dev; skb_reserve(skb, local->hw.extra_tx_headroom); nullfunc = skb_put(skb, size); nullfunc->frame_control = fc; nullfunc->duration_id = 0; memcpy(nullfunc->addr1, sta->sta.addr, ETH_ALEN); memcpy(nullfunc->addr2, sdata->vif.addr, ETH_ALEN); memcpy(nullfunc->addr3, sdata->vif.addr, ETH_ALEN); nullfunc->seq_ctrl = 0; info = IEEE80211_SKB_CB(skb); info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_NL80211_FRAME_TX; info->band = band; skb_set_queue_mapping(skb, IEEE80211_AC_VO); skb->priority = 7; if (qos) nullfunc->qos_ctrl = cpu_to_le16(7); ret = ieee80211_attach_ack_skb(local, skb, cookie, GFP_ATOMIC); if (ret) { kfree_skb(skb); goto unlock; } local_bh_disable(); ieee80211_xmit(sdata, sta, skb); local_bh_enable(); ret = 0; unlock: rcu_read_unlock(); return ret; } static int ieee80211_cfg_get_channel(struct wiphy *wiphy, struct wireless_dev *wdev, unsigned int link_id, struct cfg80211_chan_def *chandef) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_link_data *link; int ret = -ENODATA; rcu_read_lock(); link = rcu_dereference(sdata->link[link_id]); if (!link) { ret = -ENOLINK; goto out; } chanctx_conf = rcu_dereference(link->conf->chanctx_conf); if (chanctx_conf) { *chandef = link->conf->chanreq.oper; ret = 0; } else if (!ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR) && local->open_count > 0 && 
local->open_count == local->monitors && sdata->vif.type == NL80211_IFTYPE_MONITOR) { *chandef = local->monitor_chanreq.oper; ret = 0; } out: rcu_read_unlock(); return ret; } #ifdef CONFIG_PM static void ieee80211_set_wakeup(struct wiphy *wiphy, bool enabled) { drv_set_wakeup(wiphy_priv(wiphy), enabled); } #endif static int ieee80211_set_qos_map(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_qos_map *qos_map) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct mac80211_qos_map *new_qos_map, *old_qos_map; if (qos_map) { new_qos_map = kzalloc(sizeof(*new_qos_map), GFP_KERNEL); if (!new_qos_map) return -ENOMEM; memcpy(&new_qos_map->qos_map, qos_map, sizeof(*qos_map)); } else { /* A NULL qos_map was passed to disable QoS mapping */ new_qos_map = NULL; } old_qos_map = sdata_dereference(sdata->qos_map, sdata); rcu_assign_pointer(sdata->qos_map, new_qos_map); if (old_qos_map) kfree_rcu(old_qos_map, rcu_head); return 0; } static int ieee80211_set_ap_chanwidth(struct wiphy *wiphy, struct net_device *dev, unsigned int link_id, struct cfg80211_chan_def *chandef) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link; struct ieee80211_chan_req chanreq = { .oper = *chandef }; int ret; u64 changed = 0; link = sdata_dereference(sdata->link[link_id], sdata); ret = ieee80211_link_change_chanreq(link, &chanreq, &changed); if (ret == 0) ieee80211_link_info_change_notify(sdata, link, changed); return ret; } static int ieee80211_add_tx_ts(struct wiphy *wiphy, struct net_device *dev, u8 tsid, const u8 *peer, u8 up, u16 admitted_time) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; int ac = ieee802_1d_to_ac[up]; if (sdata->vif.type != NL80211_IFTYPE_STATION) return -EOPNOTSUPP; if (!(sdata->wmm_acm & BIT(up))) return -EINVAL; if (ifmgd->tx_tspec[ac].admitted_time) return -EBUSY; if (admitted_time) { ifmgd->tx_tspec[ac].admitted_time = 32 * admitted_time; ifmgd->tx_tspec[ac].tsid = tsid; ifmgd->tx_tspec[ac].up = up; } return 0; } static int ieee80211_del_tx_ts(struct wiphy *wiphy, struct net_device *dev, u8 tsid, const u8 *peer) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = wiphy_priv(wiphy); int ac; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { struct ieee80211_sta_tx_tspec *tx_tspec = &ifmgd->tx_tspec[ac]; /* skip unused entries */ if (!tx_tspec->admitted_time) continue; if (tx_tspec->tsid != tsid) continue; /* due to this new packets will be reassigned to non-ACM ACs */ tx_tspec->up = -1; /* Make sure that all packets have been sent to avoid to * restore the QoS params on packets that are still on the * queues. 
*/ synchronize_net(); ieee80211_flush_queues(local, sdata, false); /* restore the normal QoS parameters * (unconditionally to avoid races) */ tx_tspec->action = TX_TSPEC_ACTION_STOP_DOWNGRADE; tx_tspec->downgraded = false; ieee80211_sta_handle_tspec_ac_params(sdata); /* finally clear all the data */ memset(tx_tspec, 0, sizeof(*tx_tspec)); return 0; } return -ENOENT; } void ieee80211_nan_func_terminated(struct ieee80211_vif *vif, u8 inst_id, enum nl80211_nan_func_term_reason reason, gfp_t gfp) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct cfg80211_nan_func *func; u64 cookie; if (WARN_ON(vif->type != NL80211_IFTYPE_NAN)) return; spin_lock_bh(&sdata->u.nan.func_lock); func = idr_find(&sdata->u.nan.function_inst_ids, inst_id); if (WARN_ON(!func)) { spin_unlock_bh(&sdata->u.nan.func_lock); return; } cookie = func->cookie; idr_remove(&sdata->u.nan.function_inst_ids, inst_id); spin_unlock_bh(&sdata->u.nan.func_lock); cfg80211_free_nan_func(func); cfg80211_nan_func_terminated(ieee80211_vif_to_wdev(vif), inst_id, reason, cookie, gfp); } EXPORT_SYMBOL(ieee80211_nan_func_terminated); void ieee80211_nan_func_match(struct ieee80211_vif *vif, struct cfg80211_nan_match_params *match, gfp_t gfp) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct cfg80211_nan_func *func; if (WARN_ON(vif->type != NL80211_IFTYPE_NAN)) return; spin_lock_bh(&sdata->u.nan.func_lock); func = idr_find(&sdata->u.nan.function_inst_ids, match->inst_id); if (WARN_ON(!func)) { spin_unlock_bh(&sdata->u.nan.func_lock); return; } match->cookie = func->cookie; spin_unlock_bh(&sdata->u.nan.func_lock); cfg80211_nan_match(ieee80211_vif_to_wdev(vif), match, gfp); } EXPORT_SYMBOL(ieee80211_nan_func_match); static int ieee80211_set_multicast_to_unicast(struct wiphy *wiphy, struct net_device *dev, const bool enabled) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); sdata->u.ap.multicast_to_unicast = enabled; return 0; } void ieee80211_fill_txq_stats(struct cfg80211_txq_stats *txqstats, struct txq_info *txqi) { if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_BACKLOG_BYTES))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_BACKLOG_BYTES); txqstats->backlog_bytes = txqi->tin.backlog_bytes; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_BACKLOG_PACKETS))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_BACKLOG_PACKETS); txqstats->backlog_packets = txqi->tin.backlog_packets; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_FLOWS))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_FLOWS); txqstats->flows = txqi->tin.flows; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_DROPS))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_DROPS); txqstats->drops = txqi->cstats.drop_count; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_ECN_MARKS))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_ECN_MARKS); txqstats->ecn_marks = txqi->cstats.ecn_mark; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_OVERLIMIT))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_OVERLIMIT); txqstats->overlimit = txqi->tin.overlimit; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_COLLISIONS))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_COLLISIONS); txqstats->collisions = txqi->tin.collisions; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_TX_BYTES))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_TX_BYTES); txqstats->tx_bytes = txqi->tin.tx_bytes; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_TX_PACKETS))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_TX_PACKETS); txqstats->tx_packets = txqi->tin.tx_packets; } } static int 
ieee80211_get_txq_stats(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_txq_stats *txqstats) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata; int ret = 0; spin_lock_bh(&local->fq.lock); rcu_read_lock(); if (wdev) { sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); if (!sdata->vif.txq) { ret = 1; goto out; } ieee80211_fill_txq_stats(txqstats, to_txq_info(sdata->vif.txq)); } else { /* phy stats */ txqstats->filled |= BIT(NL80211_TXQ_STATS_BACKLOG_PACKETS) | BIT(NL80211_TXQ_STATS_BACKLOG_BYTES) | BIT(NL80211_TXQ_STATS_OVERLIMIT) | BIT(NL80211_TXQ_STATS_OVERMEMORY) | BIT(NL80211_TXQ_STATS_COLLISIONS) | BIT(NL80211_TXQ_STATS_MAX_FLOWS); txqstats->backlog_packets = local->fq.backlog; txqstats->backlog_bytes = local->fq.memory_usage; txqstats->overlimit = local->fq.overlimit; txqstats->overmemory = local->fq.overmemory; txqstats->collisions = local->fq.collisions; txqstats->max_flows = local->fq.flows_cnt; } out: rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); return ret; } static int ieee80211_get_ftm_responder_stats(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_ftm_responder_stats *ftm_stats) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); return drv_get_ftm_responder_stats(local, sdata, ftm_stats); } static int ieee80211_start_pmsr(struct wiphy *wiphy, struct wireless_dev *dev, struct cfg80211_pmsr_request *request) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(dev); return drv_start_pmsr(local, sdata, request); } static void ieee80211_abort_pmsr(struct wiphy *wiphy, struct wireless_dev *dev, struct cfg80211_pmsr_request *request) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(dev); return drv_abort_pmsr(local, sdata, request); } static int ieee80211_set_tid_config(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_tid_config *tid_conf) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (!sdata->local->ops->set_tid_config) return -EOPNOTSUPP; if (!tid_conf->peer) return drv_set_tid_config(sdata->local, sdata, NULL, tid_conf); sta = sta_info_get_bss(sdata, tid_conf->peer); if (!sta) return -ENOENT; return drv_set_tid_config(sdata->local, sdata, &sta->sta, tid_conf); } static int ieee80211_reset_tid_config(struct wiphy *wiphy, struct net_device *dev, const u8 *peer, u8 tids) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (!sdata->local->ops->reset_tid_config) return -EOPNOTSUPP; if (!peer) return drv_reset_tid_config(sdata->local, sdata, NULL, tids); sta = sta_info_get_bss(sdata, peer); if (!sta) return -ENOENT; return drv_reset_tid_config(sdata->local, sdata, &sta->sta, tids); } static int ieee80211_set_sar_specs(struct wiphy *wiphy, struct cfg80211_sar_specs *sar) { struct ieee80211_local *local = wiphy_priv(wiphy); if (!local->ops->set_sar_specs) return -EOPNOTSUPP; return local->ops->set_sar_specs(&local->hw, sar); } static int ieee80211_set_after_color_change_beacon(struct ieee80211_link_data *link, u64 *changed) { struct ieee80211_sub_if_data *sdata = link->sdata; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: { int ret; if (!link->u.ap.next_beacon) return -EINVAL; ret = ieee80211_assign_beacon(sdata, link, 
link->u.ap.next_beacon, NULL, NULL, changed); ieee80211_free_next_beacon(link); if (ret < 0) return ret; break; } default: WARN_ON_ONCE(1); return -EINVAL; } return 0; } static int ieee80211_set_color_change_beacon(struct ieee80211_link_data *link, struct cfg80211_color_change_settings *params, u64 *changed) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_color_change_settings color_change = {}; int err; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: link->u.ap.next_beacon = cfg80211_beacon_dup(&params->beacon_next); if (!link->u.ap.next_beacon) return -ENOMEM; if (params->count <= 1) break; color_change.counter_offset_beacon = params->counter_offset_beacon; color_change.counter_offset_presp = params->counter_offset_presp; color_change.count = params->count; err = ieee80211_assign_beacon(sdata, link, &params->beacon_color_change, NULL, &color_change, changed); if (err < 0) { ieee80211_free_next_beacon(link); return err; } break; default: return -EOPNOTSUPP; } return 0; } static void ieee80211_color_change_bss_config_notify(struct ieee80211_link_data *link, u8 color, int enable, u64 changed) { struct ieee80211_sub_if_data *sdata = link->sdata; lockdep_assert_wiphy(sdata->local->hw.wiphy); link->conf->he_bss_color.color = color; link->conf->he_bss_color.enabled = enable; changed |= BSS_CHANGED_HE_BSS_COLOR; ieee80211_link_info_change_notify(sdata, link, changed); if (!sdata->vif.bss_conf.nontransmitted && sdata->vif.mbssid_tx_vif) { struct ieee80211_sub_if_data *child; list_for_each_entry(child, &sdata->local->interfaces, list) { if (child != sdata && child->vif.mbssid_tx_vif == &sdata->vif) { child->vif.bss_conf.he_bss_color.color = color; child->vif.bss_conf.he_bss_color.enabled = enable; ieee80211_link_info_change_notify(child, &child->deflink, BSS_CHANGED_HE_BSS_COLOR); } } } } static int ieee80211_color_change_finalize(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; u64 changed = 0; int err; lockdep_assert_wiphy(local->hw.wiphy); link->conf->color_change_active = false; err = ieee80211_set_after_color_change_beacon(link, &changed); if (err) { cfg80211_color_change_aborted_notify(sdata->dev, link->link_id); return err; } ieee80211_color_change_bss_config_notify(link, link->conf->color_change_color, 1, changed); cfg80211_color_change_notify(sdata->dev, link->link_id); return 0; } void ieee80211_color_change_finalize_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_link_data *link = container_of(work, struct ieee80211_link_data, color_change_finalize_work); struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_bss_conf *link_conf = link->conf; struct ieee80211_local *local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); /* AP might have been stopped while waiting for the lock. 
*/ if (!link_conf->color_change_active) return; if (!ieee80211_sdata_running(sdata)) return; ieee80211_color_change_finalize(link); } void ieee80211_color_collision_detection_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_link_data *link = container_of(work, struct ieee80211_link_data, color_collision_detect_work.work); struct ieee80211_sub_if_data *sdata = link->sdata; cfg80211_obss_color_collision_notify(sdata->dev, link->color_bitmap, link->link_id); } void ieee80211_color_change_finish(struct ieee80211_vif *vif, u8 link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_link_data *link; if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) return; rcu_read_lock(); link = rcu_dereference(sdata->link[link_id]); if (WARN_ON(!link)) { rcu_read_unlock(); return; } wiphy_work_queue(sdata->local->hw.wiphy, &link->color_change_finalize_work); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_color_change_finish); void ieee80211_obss_color_collision_notify(struct ieee80211_vif *vif, u64 color_bitmap, u8 link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_link_data *link; if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) return; rcu_read_lock(); link = rcu_dereference(sdata->link[link_id]); if (WARN_ON(!link)) { rcu_read_unlock(); return; } if (link->conf->color_change_active || link->conf->csa_active) { rcu_read_unlock(); return; } if (wiphy_delayed_work_pending(sdata->local->hw.wiphy, &link->color_collision_detect_work)) { rcu_read_unlock(); return; } link->color_bitmap = color_bitmap; /* queue the color collision detection event every 500 ms in order to * avoid sending too much netlink messages to userspace. */ wiphy_delayed_work_queue(sdata->local->hw.wiphy, &link->color_collision_detect_work, msecs_to_jiffies(500)); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_obss_color_collision_notify); static int ieee80211_color_change(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_color_change_settings *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct ieee80211_bss_conf *link_conf; struct ieee80211_link_data *link; u8 link_id = params->link_id; u64 changed = 0; int err; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) return -EINVAL; link = wiphy_dereference(wiphy, sdata->link[link_id]); if (!link) return -ENOLINK; link_conf = link->conf; if (link_conf->nontransmitted) return -EINVAL; /* don't allow another color change if one is already active or if csa * is active */ if (link_conf->color_change_active || link_conf->csa_active) { err = -EBUSY; goto out; } err = ieee80211_set_color_change_beacon(link, params, &changed); if (err) goto out; link_conf->color_change_active = true; link_conf->color_change_color = params->color; cfg80211_color_change_started_notify(sdata->dev, params->count, link_id); if (changed) ieee80211_color_change_bss_config_notify(link, 0, 0, changed); else /* if the beacon didn't change, we can finalize immediately */ ieee80211_color_change_finalize(link); out: return err; } static int ieee80211_set_radar_background(struct wiphy *wiphy, struct cfg80211_chan_def *chandef) { struct ieee80211_local *local = wiphy_priv(wiphy); if (!local->ops->set_radar_background) return -EOPNOTSUPP; return local->ops->set_radar_background(&local->hw, chandef); } static int ieee80211_add_intf_link(struct wiphy *wiphy, struct wireless_dev *wdev, unsigned int link_id) { struct 
ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); lockdep_assert_wiphy(sdata->local->hw.wiphy); if (wdev->use_4addr) return -EOPNOTSUPP; return ieee80211_vif_set_links(sdata, wdev->valid_links, 0); } static void ieee80211_del_intf_link(struct wiphy *wiphy, struct wireless_dev *wdev, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); u16 new_links = wdev->valid_links & ~BIT(link_id); lockdep_assert_wiphy(sdata->local->hw.wiphy); /* During the link teardown process, certain functions require the * link_id to remain in the valid_links bitmap. Therefore, instead * of removing the link_id from the bitmap, pass a masked value to * simulate as if link_id does not exist anymore. */ ieee80211_vif_set_links(sdata, new_links, 0); } static int ieee80211_add_link_station(struct wiphy *wiphy, struct net_device *dev, struct link_station_parameters *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = wiphy_priv(wiphy); struct sta_info *sta; int ret; lockdep_assert_wiphy(local->hw.wiphy); sta = sta_info_get_bss(sdata, params->mld_mac); if (!sta) return -ENOENT; if (!sta->sta.valid_links) return -EINVAL; if (sta->sta.valid_links & BIT(params->link_id)) return -EALREADY; ret = ieee80211_sta_allocate_link(sta, params->link_id); if (ret) return ret; ret = sta_link_apply_parameters(local, sta, STA_LINK_MODE_NEW, params); if (ret) { ieee80211_sta_free_link(sta, params->link_id); return ret; } if (test_sta_flag(sta, WLAN_STA_ASSOC)) { struct link_sta_info *link_sta; link_sta = sdata_dereference(sta->link[params->link_id], sdata); rate_control_rate_init(link_sta); } /* ieee80211_sta_activate_link frees the link upon failure */ return ieee80211_sta_activate_link(sta, params->link_id); } static int ieee80211_mod_link_station(struct wiphy *wiphy, struct net_device *dev, struct link_station_parameters *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = wiphy_priv(wiphy); struct sta_info *sta; lockdep_assert_wiphy(local->hw.wiphy); sta = sta_info_get_bss(sdata, params->mld_mac); if (!sta) return -ENOENT; if (!(sta->sta.valid_links & BIT(params->link_id))) return -EINVAL; return sta_link_apply_parameters(local, sta, STA_LINK_MODE_LINK_MODIFY, params); } static int ieee80211_del_link_station(struct wiphy *wiphy, struct net_device *dev, struct link_station_del_parameters *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); sta = sta_info_get_bss(sdata, params->mld_mac); if (!sta) return -ENOENT; if (!(sta->sta.valid_links & BIT(params->link_id))) return -EINVAL; /* must not create a STA without links */ if (sta->sta.valid_links == BIT(params->link_id)) return -EINVAL; ieee80211_sta_remove_link(sta, params->link_id); return 0; } static int ieee80211_set_hw_timestamp(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_set_hw_timestamp *hwts) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; if (!local->ops->set_hw_timestamp) return -EOPNOTSUPP; if (!check_sdata_in_driver(sdata)) return -EIO; return local->ops->set_hw_timestamp(&local->hw, &sdata->vif, hwts); } static int ieee80211_set_ttlm(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_ttlm_params *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); lockdep_assert_wiphy(sdata->local->hw.wiphy); 
return ieee80211_req_neg_ttlm(sdata, params); } const struct cfg80211_ops mac80211_config_ops = { .add_virtual_intf = ieee80211_add_iface, .del_virtual_intf = ieee80211_del_iface, .change_virtual_intf = ieee80211_change_iface, .start_p2p_device = ieee80211_start_p2p_device, .stop_p2p_device = ieee80211_stop_p2p_device, .add_key = ieee80211_add_key, .del_key = ieee80211_del_key, .get_key = ieee80211_get_key, .set_default_key = ieee80211_config_default_key, .set_default_mgmt_key = ieee80211_config_default_mgmt_key, .set_default_beacon_key = ieee80211_config_default_beacon_key, .start_ap = ieee80211_start_ap, .change_beacon = ieee80211_change_beacon, .stop_ap = ieee80211_stop_ap, .add_station = ieee80211_add_station, .del_station = ieee80211_del_station, .change_station = ieee80211_change_station, .get_station = ieee80211_get_station, .dump_station = ieee80211_dump_station, .dump_survey = ieee80211_dump_survey, #ifdef CONFIG_MAC80211_MESH .add_mpath = ieee80211_add_mpath, .del_mpath = ieee80211_del_mpath, .change_mpath = ieee80211_change_mpath, .get_mpath = ieee80211_get_mpath, .dump_mpath = ieee80211_dump_mpath, .get_mpp = ieee80211_get_mpp, .dump_mpp = ieee80211_dump_mpp, .update_mesh_config = ieee80211_update_mesh_config, .get_mesh_config = ieee80211_get_mesh_config, .join_mesh = ieee80211_join_mesh, .leave_mesh = ieee80211_leave_mesh, #endif .join_ocb = ieee80211_join_ocb, .leave_ocb = ieee80211_leave_ocb, .change_bss = ieee80211_change_bss, .inform_bss = ieee80211_inform_bss, .set_txq_params = ieee80211_set_txq_params, .set_monitor_channel = ieee80211_set_monitor_channel, .suspend = ieee80211_suspend, .resume = ieee80211_resume, .scan = ieee80211_scan, .abort_scan = ieee80211_abort_scan, .sched_scan_start = ieee80211_sched_scan_start, .sched_scan_stop = ieee80211_sched_scan_stop, .auth = ieee80211_auth, .assoc = ieee80211_assoc, .deauth = ieee80211_deauth, .disassoc = ieee80211_disassoc, .join_ibss = ieee80211_join_ibss, .leave_ibss = ieee80211_leave_ibss, .set_mcast_rate = ieee80211_set_mcast_rate, .set_wiphy_params = ieee80211_set_wiphy_params, .set_tx_power = ieee80211_set_tx_power, .get_tx_power = ieee80211_get_tx_power, .rfkill_poll = ieee80211_rfkill_poll, CFG80211_TESTMODE_CMD(ieee80211_testmode_cmd) CFG80211_TESTMODE_DUMP(ieee80211_testmode_dump) .set_power_mgmt = ieee80211_set_power_mgmt, .set_bitrate_mask = ieee80211_set_bitrate_mask, .remain_on_channel = ieee80211_remain_on_channel, .cancel_remain_on_channel = ieee80211_cancel_remain_on_channel, .mgmt_tx = ieee80211_mgmt_tx, .mgmt_tx_cancel_wait = ieee80211_mgmt_tx_cancel_wait, .set_cqm_rssi_config = ieee80211_set_cqm_rssi_config, .set_cqm_rssi_range_config = ieee80211_set_cqm_rssi_range_config, .update_mgmt_frame_registrations = ieee80211_update_mgmt_frame_registrations, .set_antenna = ieee80211_set_antenna, .get_antenna = ieee80211_get_antenna, .set_rekey_data = ieee80211_set_rekey_data, .tdls_oper = ieee80211_tdls_oper, .tdls_mgmt = ieee80211_tdls_mgmt, .tdls_channel_switch = ieee80211_tdls_channel_switch, .tdls_cancel_channel_switch = ieee80211_tdls_cancel_channel_switch, .probe_client = ieee80211_probe_client, .set_noack_map = ieee80211_set_noack_map, #ifdef CONFIG_PM .set_wakeup = ieee80211_set_wakeup, #endif .get_channel = ieee80211_cfg_get_channel, .start_radar_detection = ieee80211_start_radar_detection, .end_cac = ieee80211_end_cac, .channel_switch = ieee80211_channel_switch, .set_qos_map = ieee80211_set_qos_map, .set_ap_chanwidth = ieee80211_set_ap_chanwidth, .add_tx_ts = ieee80211_add_tx_ts, .del_tx_ts = 
ieee80211_del_tx_ts, .start_nan = ieee80211_start_nan, .stop_nan = ieee80211_stop_nan, .nan_change_conf = ieee80211_nan_change_conf, .add_nan_func = ieee80211_add_nan_func, .del_nan_func = ieee80211_del_nan_func, .set_multicast_to_unicast = ieee80211_set_multicast_to_unicast, .tx_control_port = ieee80211_tx_control_port, .get_txq_stats = ieee80211_get_txq_stats, .get_ftm_responder_stats = ieee80211_get_ftm_responder_stats, .start_pmsr = ieee80211_start_pmsr, .abort_pmsr = ieee80211_abort_pmsr, .probe_mesh_link = ieee80211_probe_mesh_link, .set_tid_config = ieee80211_set_tid_config, .reset_tid_config = ieee80211_reset_tid_config, .set_sar_specs = ieee80211_set_sar_specs, .color_change = ieee80211_color_change, .set_radar_background = ieee80211_set_radar_background, .add_intf_link = ieee80211_add_intf_link, .del_intf_link = ieee80211_del_intf_link, .add_link_station = ieee80211_add_link_station, .mod_link_station = ieee80211_mod_link_station, .del_link_station = ieee80211_del_link_station, .set_hw_timestamp = ieee80211_set_hw_timestamp, .set_ttlm = ieee80211_set_ttlm, .get_radio_mask = ieee80211_get_radio_mask, };
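The mac80211_config_ops table above is the bridge between nl80211/cfg80211 and the handlers defined throughout this file: cfg80211 invokes these callbacks for every userspace configuration request on a mac80211 wiphy, and mac80211 binds the table to its wiphy when the hardware is allocated (ieee80211_alloc_hw()). As a rough, hedged sketch of the general cfg80211 registration pattern only, a minimal full-MAC style driver would wire a (much smaller) ops table to a wiphy roughly as below; all my_fmac_* names and the trimmed ops table are illustrative assumptions, while wiphy_new(), wiphy_priv(), set_wiphy_dev(), wiphy_register(), wiphy_unregister() and wiphy_free() are the real cfg80211 API.

/*
 * Hedged sketch only: how a cfg80211_ops table is bound to a wiphy in
 * general.  The my_fmac_* identifiers are hypothetical; the cfg80211
 * calls used (wiphy_new, wiphy_priv, set_wiphy_dev, wiphy_register,
 * wiphy_unregister, wiphy_free) are real API.
 */
#include <net/cfg80211.h>

struct my_fmac_priv {
	struct device *dev;	/* bus device that owns the hardware */
};

static struct wiphy *my_fmac_wiphy;

static int my_fmac_scan(struct wiphy *wiphy,
			struct cfg80211_scan_request *request)
{
	/* A real driver would start the hardware scan here and report
	 * completion later via cfg80211_scan_done().
	 */
	return -EOPNOTSUPP;
}

static const struct cfg80211_ops my_fmac_cfg_ops = {
	.scan = my_fmac_scan,
	/* further handlers (.connect, .disconnect, ...) would go here */
};

static int my_fmac_register(struct device *dev)
{
	struct my_fmac_priv *priv;
	int err;

	my_fmac_wiphy = wiphy_new(&my_fmac_cfg_ops, sizeof(*priv));
	if (!my_fmac_wiphy)
		return -ENOMEM;

	priv = wiphy_priv(my_fmac_wiphy);
	priv->dev = dev;
	set_wiphy_dev(my_fmac_wiphy, dev);

	/* A real driver must also fill in wiphy->bands and
	 * wiphy->interface_modes before registering.
	 */
	err = wiphy_register(my_fmac_wiphy);
	if (err) {
		wiphy_free(my_fmac_wiphy);
		my_fmac_wiphy = NULL;
		return err;
	}
	return 0;
}

static void my_fmac_unregister(void)
{
	wiphy_unregister(my_fmac_wiphy);
	wiphy_free(my_fmac_wiphy);
}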
// SPDX-License-Identifier: GPL-2.0-or-later /* * Create default crypto algorithm instances. * * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/internal/aead.h> #include <linux/completion.h> #include <linux/ctype.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/notifier.h> #include <linux/rtnetlink.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/string.h> #include "internal.h" struct cryptomgr_param { struct rtattr *tb[CRYPTO_MAX_ATTRS + 2]; struct { struct rtattr attr; struct crypto_attr_type data; } type; struct { struct rtattr attr; struct crypto_attr_alg data; } attrs[CRYPTO_MAX_ATTRS]; char template[CRYPTO_MAX_ALG_NAME]; struct crypto_larval *larval; u32 otype; u32 omask; }; struct crypto_test_param { char driver[CRYPTO_MAX_ALG_NAME]; char alg[CRYPTO_MAX_ALG_NAME]; u32 type; }; static int cryptomgr_probe(void *data) { struct cryptomgr_param *param = data; struct crypto_template *tmpl; int err = -ENOENT; tmpl = crypto_lookup_template(param->template); if (!tmpl) goto out; do { err = tmpl->create(tmpl, param->tb); } while (err == -EAGAIN && !signal_pending(current)); crypto_tmpl_put(tmpl); out: param->larval->adult = ERR_PTR(err); param->larval->alg.cra_flags |= CRYPTO_ALG_DEAD; complete_all(&param->larval->completion); crypto_alg_put(&param->larval->alg); kfree(param); module_put_and_kthread_exit(0); } static int cryptomgr_schedule_probe(struct crypto_larval *larval) { struct task_struct *thread; struct cryptomgr_param *param; const char *name = larval->alg.cra_name; const char *p; unsigned int len; int i; if (!try_module_get(THIS_MODULE)) goto err; param = kzalloc(sizeof(*param), GFP_KERNEL); if (!param) goto err_put_module; for (p = name; isalnum(*p) || *p == '-' || *p == '_'; p++) ; len = p - name; if (!len || *p != '(') goto err_free_param; memcpy(param->template, name, len); i = 0; for (;;) { name = ++p; for (; isalnum(*p) || *p == '-' || *p == '_'; p++) ; if (*p == '(') { int recursion = 0; for (;;) { if (!*++p) goto err_free_param; if (*p == '(') recursion++; else if (*p == ')' && !recursion--) break; } p++; } len = p - name; if (!len) goto err_free_param; param->attrs[i].attr.rta_len = sizeof(param->attrs[i]); param->attrs[i].attr.rta_type = CRYPTOA_ALG; memcpy(param->attrs[i].data.name, name, len); param->tb[i + 1] = &param->attrs[i].attr; i++; if (i >= CRYPTO_MAX_ATTRS) goto err_free_param; if (*p == ')') break; if (*p != ',') goto err_free_param; } param->tb[i + 1] = NULL; param->type.attr.rta_len =
sizeof(param->type); param->type.attr.rta_type = CRYPTOA_TYPE; param->type.data.type = larval->alg.cra_flags & ~CRYPTO_ALG_TESTED; param->type.data.mask = larval->mask & ~CRYPTO_ALG_TESTED; param->tb[0] = &param->type.attr; param->otype = larval->alg.cra_flags; param->omask = larval->mask; crypto_alg_get(&larval->alg); param->larval = larval; thread = kthread_run(cryptomgr_probe, param, "cryptomgr_probe"); if (IS_ERR(thread)) goto err_put_larval; return NOTIFY_STOP; err_put_larval: crypto_alg_put(&larval->alg); err_free_param: kfree(param); err_put_module: module_put(THIS_MODULE); err: return NOTIFY_OK; } static int cryptomgr_test(void *data) { struct crypto_test_param *param = data; u32 type = param->type; int err; err = alg_test(param->driver, param->alg, type, CRYPTO_ALG_TESTED); crypto_alg_tested(param->driver, err); kfree(param); module_put_and_kthread_exit(0); } static int cryptomgr_schedule_test(struct crypto_alg *alg) { struct task_struct *thread; struct crypto_test_param *param; if (IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS)) return NOTIFY_DONE; if (!try_module_get(THIS_MODULE)) goto err; param = kzalloc(sizeof(*param), GFP_KERNEL); if (!param) goto err_put_module; memcpy(param->driver, alg->cra_driver_name, sizeof(param->driver)); memcpy(param->alg, alg->cra_name, sizeof(param->alg)); param->type = alg->cra_flags; thread = kthread_run(cryptomgr_test, param, "cryptomgr_test"); if (IS_ERR(thread)) goto err_free_param; return NOTIFY_STOP; err_free_param: kfree(param); err_put_module: module_put(THIS_MODULE); err: return NOTIFY_OK; } static int cryptomgr_notify(struct notifier_block *this, unsigned long msg, void *data) { switch (msg) { case CRYPTO_MSG_ALG_REQUEST: return cryptomgr_schedule_probe(data); case CRYPTO_MSG_ALG_REGISTER: return cryptomgr_schedule_test(data); case CRYPTO_MSG_ALG_LOADED: break; } return NOTIFY_DONE; } static struct notifier_block cryptomgr_notifier = { .notifier_call = cryptomgr_notify, }; static int __init cryptomgr_init(void) { return crypto_register_notifier(&cryptomgr_notifier); } static void __exit cryptomgr_exit(void) { int err = crypto_unregister_notifier(&cryptomgr_notifier); BUG_ON(err); } /* * This is arch_initcall() so that the crypto self-tests are run on algorithms * registered early by subsys_initcall(). subsys_initcall() is needed for * generic implementations so that they're available for comparison tests when * other implementations are registered later by module_init(). */ arch_initcall(cryptomgr_init); module_exit(cryptomgr_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Crypto Algorithm Manager");
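The cryptomgr code above is what services a lookup for a composite algorithm name: when something asks for e.g. "hmac(sha256)" and no matching instance is registered yet, cryptomgr_schedule_probe() parses the name into template and parameters and cryptomgr_probe() asks the "hmac" template to create the instance. A minimal, hedged sketch of the caller side that exercises this path through the normal shash API could look like the following; compute_hmac_sha256() is a hypothetical helper, while the crypto_* and shash calls are real API.

/*
 * Hedged sketch: requesting "hmac(sha256)" through the shash API, which
 * is the kind of lookup that ends up in cryptomgr_schedule_probe() when
 * no matching instance exists yet.  compute_hmac_sha256() is a
 * hypothetical helper, not part of the kernel.
 */
#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/string.h>

static int compute_hmac_sha256(const u8 *key, unsigned int keylen,
			       const u8 *data, unsigned int len,
			       u8 *out /* 32 bytes */)
{
	struct crypto_shash *tfm;
	int err;

	/* Looking up the composite name may cause cryptomgr to build a
	 * new "hmac" template instance around "sha256".
	 */
	tfm = crypto_alloc_shash("hmac(sha256)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_shash_setkey(tfm, key, keylen);
	if (!err) {
		SHASH_DESC_ON_STACK(desc, tfm);

		desc->tfm = tfm;
		err = crypto_shash_digest(desc, data, len, out);
		shash_desc_zero(desc);
	}

	crypto_free_shash(tfm);
	return err;
}

SHASH_DESC_ON_STACK() is used here so the request descriptor needs no separate allocation, which keeps the sketch self-contained.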
173 171 207 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 /* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update mechanism for mutual exclusion, adapted for tracing. * * Copyright (C) 2020 Paul E. McKenney. */ #ifndef __LINUX_RCUPDATE_TRACE_H #define __LINUX_RCUPDATE_TRACE_H #include <linux/sched.h> #include <linux/rcupdate.h> #include <linux/cleanup.h> extern struct lockdep_map rcu_trace_lock_map; #ifdef CONFIG_DEBUG_LOCK_ALLOC static inline int rcu_read_lock_trace_held(void) { return lock_is_held(&rcu_trace_lock_map); } #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ static inline int rcu_read_lock_trace_held(void) { return 1; } #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #ifdef CONFIG_TASKS_TRACE_RCU void rcu_read_unlock_trace_special(struct task_struct *t); /** * rcu_read_lock_trace - mark beginning of RCU-trace read-side critical section * * When synchronize_rcu_tasks_trace() is invoked by one task, then that * task is guaranteed to block until all other tasks exit their read-side * critical sections. Similarly, if call_rcu_trace() is invoked on one * task while other tasks are within RCU read-side critical sections, * invocation of the corresponding RCU callback is deferred until after * the all the other tasks exit their critical sections. * * For more details, please see the documentation for rcu_read_lock(). */ static inline void rcu_read_lock_trace(void) { struct task_struct *t = current; WRITE_ONCE(t->trc_reader_nesting, READ_ONCE(t->trc_reader_nesting) + 1); barrier(); if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && t->trc_reader_special.b.need_mb) smp_mb(); // Pairs with update-side barriers rcu_lock_acquire(&rcu_trace_lock_map); } /** * rcu_read_unlock_trace - mark end of RCU-trace read-side critical section * * Pairs with a preceding call to rcu_read_lock_trace(), and nesting is * allowed. Invoking a rcu_read_unlock_trace() when there is no matching * rcu_read_lock_trace() is verboten, and will result in lockdep complaints. * * For more details, please see the documentation for rcu_read_unlock(). */ static inline void rcu_read_unlock_trace(void) { int nesting; struct task_struct *t = current; rcu_lock_release(&rcu_trace_lock_map); nesting = READ_ONCE(t->trc_reader_nesting) - 1; barrier(); // Critical section before disabling. // Disable IPI-based setting of .need_qs. WRITE_ONCE(t->trc_reader_nesting, INT_MIN + nesting); if (likely(!READ_ONCE(t->trc_reader_special.s)) || nesting) { WRITE_ONCE(t->trc_reader_nesting, nesting); return; // We assume shallow reader nesting. } WARN_ON_ONCE(nesting != 0); rcu_read_unlock_trace_special(t); } void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); void synchronize_rcu_tasks_trace(void); void rcu_barrier_tasks_trace(void); struct task_struct *get_rcu_tasks_trace_gp_kthread(void); #else /* * The BPF JIT forms these addresses even when it doesn't call these * functions, so provide definitions that result in runtime errors. 
*/ static inline void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func) { BUG(); } static inline void rcu_read_lock_trace(void) { BUG(); } static inline void rcu_read_unlock_trace(void) { BUG(); } #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ DEFINE_LOCK_GUARD_0(rcu_tasks_trace, rcu_read_lock_trace(), rcu_read_unlock_trace()) #endif /* __LINUX_RCUPDATE_TRACE_H */
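The read-side markers above pair with the tasks-trace grace-period primitives declared alongside them. A minimal usage sketch (my own illustration, with hypothetical my_data/my_ptr/my_reader/my_update names, not code from the kernel tree) follows:

/*
 * Usage sketch: the reader samples an RCU-protected pointer inside a
 * tasks-trace read-side section; the updater publishes a replacement and
 * waits for a tasks-trace grace period before freeing the old object.
 */
#include <linux/slab.h>
#include <linux/rcupdate_trace.h>

struct my_data {
	int val;
};

static struct my_data __rcu *my_ptr;

static int my_reader(void)
{
	struct my_data *p;
	int val = -1;

	rcu_read_lock_trace();
	p = rcu_dereference_check(my_ptr, rcu_read_lock_trace_held());
	if (p)
		val = p->val;
	rcu_read_unlock_trace();

	return val;
}

static int my_update(int new_val)
{
	struct my_data *new, *old;

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return -ENOMEM;
	new->val = new_val;

	old = rcu_replace_pointer(my_ptr, new, true);
	synchronize_rcu_tasks_trace();	/* wait for all tasks-trace readers */
	kfree(old);

	return 0;
}

With the DEFINE_LOCK_GUARD_0() definition above, the same read-side pairing can also be written scope-based as guard(rcu_tasks_trace)();.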
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2016 Facebook */
#include <linux/cpumask.h>
#include <linux/spinlock.h>
#include <linux/percpu.h>

#include "bpf_lru_list.h"

#define LOCAL_FREE_TARGET		(128)
#define LOCAL_NR_SCANS			LOCAL_FREE_TARGET

#define PERCPU_FREE_TARGET		(4)
#define PERCPU_NR_SCANS			PERCPU_FREE_TARGET

/* Helpers to get the local list index */
#define LOCAL_LIST_IDX(t)	((t) - BPF_LOCAL_LIST_T_OFFSET)
#define LOCAL_FREE_LIST_IDX	LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE)
#define LOCAL_PENDING_LIST_IDX	LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING)
#define
IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET) static int get_next_cpu(int cpu) { cpu = cpumask_next(cpu, cpu_possible_mask); if (cpu >= nr_cpu_ids) cpu = cpumask_first(cpu_possible_mask); return cpu; } /* Local list helpers */ static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l) { return &loc_l->lists[LOCAL_FREE_LIST_IDX]; } static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l) { return &loc_l->lists[LOCAL_PENDING_LIST_IDX]; } /* bpf_lru_node helpers */ static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node) { return READ_ONCE(node->ref); } static void bpf_lru_node_clear_ref(struct bpf_lru_node *node) { WRITE_ONCE(node->ref, 0); } static void bpf_lru_list_count_inc(struct bpf_lru_list *l, enum bpf_lru_list_type type) { if (type < NR_BPF_LRU_LIST_COUNT) l->counts[type]++; } static void bpf_lru_list_count_dec(struct bpf_lru_list *l, enum bpf_lru_list_type type) { if (type < NR_BPF_LRU_LIST_COUNT) l->counts[type]--; } static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l, struct bpf_lru_node *node, struct list_head *free_list, enum bpf_lru_list_type tgt_free_type) { if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) return; /* If the removing node is the next_inactive_rotation candidate, * move the next_inactive_rotation pointer also. */ if (&node->list == l->next_inactive_rotation) l->next_inactive_rotation = l->next_inactive_rotation->prev; bpf_lru_list_count_dec(l, node->type); node->type = tgt_free_type; list_move(&node->list, free_list); } /* Move nodes from local list to the LRU list */ static void __bpf_lru_node_move_in(struct bpf_lru_list *l, struct bpf_lru_node *node, enum bpf_lru_list_type tgt_type) { if (WARN_ON_ONCE(!IS_LOCAL_LIST_TYPE(node->type)) || WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type))) return; bpf_lru_list_count_inc(l, tgt_type); node->type = tgt_type; bpf_lru_node_clear_ref(node); list_move(&node->list, &l->lists[tgt_type]); } /* Move nodes between or within active and inactive list (like * active to inactive, inactive to active or tail of active back to * the head of active). */ static void __bpf_lru_node_move(struct bpf_lru_list *l, struct bpf_lru_node *node, enum bpf_lru_list_type tgt_type) { if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)) || WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type))) return; if (node->type != tgt_type) { bpf_lru_list_count_dec(l, node->type); bpf_lru_list_count_inc(l, tgt_type); node->type = tgt_type; } bpf_lru_node_clear_ref(node); /* If the moving node is the next_inactive_rotation candidate, * move the next_inactive_rotation pointer also. */ if (&node->list == l->next_inactive_rotation) l->next_inactive_rotation = l->next_inactive_rotation->prev; list_move(&node->list, &l->lists[tgt_type]); } static bool bpf_lru_list_inactive_low(const struct bpf_lru_list *l) { return l->counts[BPF_LRU_LIST_T_INACTIVE] < l->counts[BPF_LRU_LIST_T_ACTIVE]; } /* Rotate the active list: * 1. Start from tail * 2. If the node has the ref bit set, it will be rotated * back to the head of active list with the ref bit cleared. * Give this node one more chance to survive in the active list. * 3. If the ref bit is not set, move it to the head of the * inactive list. * 4. 
It will at most scan nr_scans nodes */ static void __bpf_lru_list_rotate_active(struct bpf_lru *lru, struct bpf_lru_list *l) { struct list_head *active = &l->lists[BPF_LRU_LIST_T_ACTIVE]; struct bpf_lru_node *node, *tmp_node, *first_node; unsigned int i = 0; first_node = list_first_entry(active, struct bpf_lru_node, list); list_for_each_entry_safe_reverse(node, tmp_node, active, list) { if (bpf_lru_node_is_ref(node)) __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); else __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); if (++i == lru->nr_scans || node == first_node) break; } } /* Rotate the inactive list. It starts from the next_inactive_rotation * 1. If the node has ref bit set, it will be moved to the head * of active list with the ref bit cleared. * 2. If the node does not have ref bit set, it will leave it * at its current location (i.e. do nothing) so that it can * be considered during the next inactive_shrink. * 3. It will at most scan nr_scans nodes */ static void __bpf_lru_list_rotate_inactive(struct bpf_lru *lru, struct bpf_lru_list *l) { struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; struct list_head *cur, *last, *next = inactive; struct bpf_lru_node *node; unsigned int i = 0; if (list_empty(inactive)) return; last = l->next_inactive_rotation->next; if (last == inactive) last = last->next; cur = l->next_inactive_rotation; while (i < lru->nr_scans) { if (cur == inactive) { cur = cur->prev; continue; } node = list_entry(cur, struct bpf_lru_node, list); next = cur->prev; if (bpf_lru_node_is_ref(node)) __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); if (cur == last) break; cur = next; i++; } l->next_inactive_rotation = next; } /* Shrink the inactive list. It starts from the tail of the * inactive list and only move the nodes without the ref bit * set to the designated free list. */ static unsigned int __bpf_lru_list_shrink_inactive(struct bpf_lru *lru, struct bpf_lru_list *l, unsigned int tgt_nshrink, struct list_head *free_list, enum bpf_lru_list_type tgt_free_type) { struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; struct bpf_lru_node *node, *tmp_node; unsigned int nshrinked = 0; unsigned int i = 0; list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) { if (bpf_lru_node_is_ref(node)) { __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); } else if (lru->del_from_htab(lru->del_arg, node)) { __bpf_lru_node_move_to_free(l, node, free_list, tgt_free_type); if (++nshrinked == tgt_nshrink) break; } if (++i == lru->nr_scans) break; } return nshrinked; } /* 1. Rotate the active list (if needed) * 2. Always rotate the inactive list */ static void __bpf_lru_list_rotate(struct bpf_lru *lru, struct bpf_lru_list *l) { if (bpf_lru_list_inactive_low(l)) __bpf_lru_list_rotate_active(lru, l); __bpf_lru_list_rotate_inactive(lru, l); } /* Calls __bpf_lru_list_shrink_inactive() to shrink some * ref-bit-cleared nodes and move them to the designated * free list. * * If it cannot get a free node after calling * __bpf_lru_list_shrink_inactive(). It will just remove * one node from either inactive or active list without * honoring the ref-bit. It prefers inactive list to active * list in this situation. 
*/ static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru, struct bpf_lru_list *l, unsigned int tgt_nshrink, struct list_head *free_list, enum bpf_lru_list_type tgt_free_type) { struct bpf_lru_node *node, *tmp_node; struct list_head *force_shrink_list; unsigned int nshrinked; nshrinked = __bpf_lru_list_shrink_inactive(lru, l, tgt_nshrink, free_list, tgt_free_type); if (nshrinked) return nshrinked; /* Do a force shrink by ignoring the reference bit */ if (!list_empty(&l->lists[BPF_LRU_LIST_T_INACTIVE])) force_shrink_list = &l->lists[BPF_LRU_LIST_T_INACTIVE]; else force_shrink_list = &l->lists[BPF_LRU_LIST_T_ACTIVE]; list_for_each_entry_safe_reverse(node, tmp_node, force_shrink_list, list) { if (lru->del_from_htab(lru->del_arg, node)) { __bpf_lru_node_move_to_free(l, node, free_list, tgt_free_type); return 1; } } return 0; } /* Flush the nodes from the local pending list to the LRU list */ static void __local_list_flush(struct bpf_lru_list *l, struct bpf_lru_locallist *loc_l) { struct bpf_lru_node *node, *tmp_node; list_for_each_entry_safe_reverse(node, tmp_node, local_pending_list(loc_l), list) { if (bpf_lru_node_is_ref(node)) __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_ACTIVE); else __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_INACTIVE); } } static void bpf_lru_list_push_free(struct bpf_lru_list *l, struct bpf_lru_node *node) { unsigned long flags; if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) return; raw_spin_lock_irqsave(&l->lock, flags); __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); raw_spin_unlock_irqrestore(&l->lock, flags); } static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l) { struct bpf_lru_list *l = &lru->common_lru.lru_list; struct bpf_lru_node *node, *tmp_node; unsigned int nfree = 0; raw_spin_lock(&l->lock); __local_list_flush(l, loc_l); __bpf_lru_list_rotate(lru, l); list_for_each_entry_safe(node, tmp_node, &l->lists[BPF_LRU_LIST_T_FREE], list) { __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); if (++nfree == LOCAL_FREE_TARGET) break; } if (nfree < LOCAL_FREE_TARGET) __bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); raw_spin_unlock(&l->lock); } static void __local_list_add_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l, int cpu, struct bpf_lru_node *node, u32 hash) { *(u32 *)((void *)node + lru->hash_offset) = hash; node->cpu = cpu; node->type = BPF_LRU_LOCAL_LIST_T_PENDING; bpf_lru_node_clear_ref(node); list_add(&node->list, local_pending_list(loc_l)); } static struct bpf_lru_node * __local_list_pop_free(struct bpf_lru_locallist *loc_l) { struct bpf_lru_node *node; node = list_first_entry_or_null(local_free_list(loc_l), struct bpf_lru_node, list); if (node) list_del(&node->list); return node; } static struct bpf_lru_node * __local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l) { struct bpf_lru_node *node; bool force = false; ignore_ref: /* Get from the tail (i.e. older element) of the pending list. 
*/ list_for_each_entry_reverse(node, local_pending_list(loc_l), list) { if ((!bpf_lru_node_is_ref(node) || force) && lru->del_from_htab(lru->del_arg, node)) { list_del(&node->list); return node; } } if (!force) { force = true; goto ignore_ref; } return NULL; } static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru, u32 hash) { struct list_head *free_list; struct bpf_lru_node *node = NULL; struct bpf_lru_list *l; unsigned long flags; int cpu = raw_smp_processor_id(); l = per_cpu_ptr(lru->percpu_lru, cpu); raw_spin_lock_irqsave(&l->lock, flags); __bpf_lru_list_rotate(lru, l); free_list = &l->lists[BPF_LRU_LIST_T_FREE]; if (list_empty(free_list)) __bpf_lru_list_shrink(lru, l, PERCPU_FREE_TARGET, free_list, BPF_LRU_LIST_T_FREE); if (!list_empty(free_list)) { node = list_first_entry(free_list, struct bpf_lru_node, list); *(u32 *)((void *)node + lru->hash_offset) = hash; bpf_lru_node_clear_ref(node); __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); } raw_spin_unlock_irqrestore(&l->lock, flags); return node; } static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, u32 hash) { struct bpf_lru_locallist *loc_l, *steal_loc_l; struct bpf_common_lru *clru = &lru->common_lru; struct bpf_lru_node *node; int steal, first_steal; unsigned long flags; int cpu = raw_smp_processor_id(); loc_l = per_cpu_ptr(clru->local_list, cpu); raw_spin_lock_irqsave(&loc_l->lock, flags); node = __local_list_pop_free(loc_l); if (!node) { bpf_lru_list_pop_free_to_local(lru, loc_l); node = __local_list_pop_free(loc_l); } if (node) __local_list_add_pending(lru, loc_l, cpu, node, hash); raw_spin_unlock_irqrestore(&loc_l->lock, flags); if (node) return node; /* No free nodes found from the local free list and * the global LRU list. * * Steal from the local free/pending list of the * current CPU and remote CPU in RR. It starts * with the loc_l->next_steal CPU. 
*/ first_steal = loc_l->next_steal; steal = first_steal; do { steal_loc_l = per_cpu_ptr(clru->local_list, steal); raw_spin_lock_irqsave(&steal_loc_l->lock, flags); node = __local_list_pop_free(steal_loc_l); if (!node) node = __local_list_pop_pending(lru, steal_loc_l); raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags); steal = get_next_cpu(steal); } while (!node && steal != first_steal); loc_l->next_steal = steal; if (node) { raw_spin_lock_irqsave(&loc_l->lock, flags); __local_list_add_pending(lru, loc_l, cpu, node, hash); raw_spin_unlock_irqrestore(&loc_l->lock, flags); } return node; } struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash) { if (lru->percpu) return bpf_percpu_lru_pop_free(lru, hash); else return bpf_common_lru_pop_free(lru, hash); } static void bpf_common_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) { u8 node_type = READ_ONCE(node->type); unsigned long flags; if (WARN_ON_ONCE(node_type == BPF_LRU_LIST_T_FREE) || WARN_ON_ONCE(node_type == BPF_LRU_LOCAL_LIST_T_FREE)) return; if (node_type == BPF_LRU_LOCAL_LIST_T_PENDING) { struct bpf_lru_locallist *loc_l; loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu); raw_spin_lock_irqsave(&loc_l->lock, flags); if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) { raw_spin_unlock_irqrestore(&loc_l->lock, flags); goto check_lru_list; } node->type = BPF_LRU_LOCAL_LIST_T_FREE; bpf_lru_node_clear_ref(node); list_move(&node->list, local_free_list(loc_l)); raw_spin_unlock_irqrestore(&loc_l->lock, flags); return; } check_lru_list: bpf_lru_list_push_free(&lru->common_lru.lru_list, node); } static void bpf_percpu_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) { struct bpf_lru_list *l; unsigned long flags; l = per_cpu_ptr(lru->percpu_lru, node->cpu); raw_spin_lock_irqsave(&l->lock, flags); __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); raw_spin_unlock_irqrestore(&l->lock, flags); } void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) { if (lru->percpu) bpf_percpu_lru_push_free(lru, node); else bpf_common_lru_push_free(lru, node); } static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, u32 elem_size, u32 nr_elems) { struct bpf_lru_list *l = &lru->common_lru.lru_list; u32 i; for (i = 0; i < nr_elems; i++) { struct bpf_lru_node *node; node = (struct bpf_lru_node *)(buf + node_offset); node->type = BPF_LRU_LIST_T_FREE; bpf_lru_node_clear_ref(node); list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); buf += elem_size; } } static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, u32 elem_size, u32 nr_elems) { u32 i, pcpu_entries; int cpu; struct bpf_lru_list *l; pcpu_entries = nr_elems / num_possible_cpus(); i = 0; for_each_possible_cpu(cpu) { struct bpf_lru_node *node; l = per_cpu_ptr(lru->percpu_lru, cpu); again: node = (struct bpf_lru_node *)(buf + node_offset); node->cpu = cpu; node->type = BPF_LRU_LIST_T_FREE; bpf_lru_node_clear_ref(node); list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); i++; buf += elem_size; if (i == nr_elems) break; if (i % pcpu_entries) goto again; } } void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, u32 elem_size, u32 nr_elems) { if (lru->percpu) bpf_percpu_lru_populate(lru, buf, node_offset, elem_size, nr_elems); else bpf_common_lru_populate(lru, buf, node_offset, elem_size, nr_elems); } static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu) { int i; for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++) INIT_LIST_HEAD(&loc_l->lists[i]); 
loc_l->next_steal = cpu; raw_spin_lock_init(&loc_l->lock); } static void bpf_lru_list_init(struct bpf_lru_list *l) { int i; for (i = 0; i < NR_BPF_LRU_LIST_T; i++) INIT_LIST_HEAD(&l->lists[i]); for (i = 0; i < NR_BPF_LRU_LIST_COUNT; i++) l->counts[i] = 0; l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE]; raw_spin_lock_init(&l->lock); } int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, del_from_htab_func del_from_htab, void *del_arg) { int cpu; if (percpu) { lru->percpu_lru = alloc_percpu(struct bpf_lru_list); if (!lru->percpu_lru) return -ENOMEM; for_each_possible_cpu(cpu) { struct bpf_lru_list *l; l = per_cpu_ptr(lru->percpu_lru, cpu); bpf_lru_list_init(l); } lru->nr_scans = PERCPU_NR_SCANS; } else { struct bpf_common_lru *clru = &lru->common_lru; clru->local_list = alloc_percpu(struct bpf_lru_locallist); if (!clru->local_list) return -ENOMEM; for_each_possible_cpu(cpu) { struct bpf_lru_locallist *loc_l; loc_l = per_cpu_ptr(clru->local_list, cpu); bpf_lru_locallist_init(loc_l, cpu); } bpf_lru_list_init(&clru->lru_list); lru->nr_scans = LOCAL_NR_SCANS; } lru->percpu = percpu; lru->del_from_htab = del_from_htab; lru->del_arg = del_arg; lru->hash_offset = hash_offset; return 0; } void bpf_lru_destroy(struct bpf_lru *lru) { if (lru->percpu) free_percpu(lru->percpu_lru); else free_percpu(lru->common_lru.local_list); }
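bpf_lru_init(), bpf_lru_populate(), bpf_lru_pop_free() and bpf_lru_push_free() form the whole external surface of this allocator. The sketch below shows how a hypothetical table might embed struct bpf_lru_node in its elements and drive that API; it is modelled loosely on the LRU hash map's usage, not copied from it, and struct my_elem, my_evict() and the my_table_* helpers are invented names.

/*
 * Consumer sketch: each element embeds a bpf_lru_node; the LRU calls back
 * into my_evict() when it needs to reclaim a node still owned by the table.
 */
#include <linux/kernel.h>
#include <linux/types.h>
#include "bpf_lru_list.h"

struct my_elem {
	struct bpf_lru_node lru_node;	/* first member, so node offset is 0 */
	u32 hash;			/* written by the LRU via hash_offset */
	u64 value;
};

/* Return true if @node's element was unlinked from the table and may be
 * handed out again as a free node.
 */
static bool my_evict(void *arg, struct bpf_lru_node *node)
{
	struct my_elem *elem = container_of(node, struct my_elem, lru_node);

	/* ... remove @elem from the owning table ... */
	(void)elem;
	return true;
}

static int my_table_setup(struct bpf_lru *lru, void *elems,
			  u32 elem_size, u32 nr_elems)
{
	int err;

	/* hash_offset is applied relative to the embedded node; since
	 * lru_node is the first member, the element offset of ->hash is
	 * also the offset from the node.
	 */
	err = bpf_lru_init(lru, false /* common LRU, not per-CPU */,
			   offsetof(struct my_elem, hash), my_evict, NULL);
	if (err)
		return err;

	/* Seed the global free list with the preallocated element array. */
	bpf_lru_populate(lru, elems, offsetof(struct my_elem, lru_node),
			 elem_size, nr_elems);
	return 0;
}

static struct my_elem *my_table_alloc(struct bpf_lru *lru, u32 hash)
{
	struct bpf_lru_node *node = bpf_lru_pop_free(lru, hash);

	return node ? container_of(node, struct my_elem, lru_node) : NULL;
}

static void my_table_free(struct bpf_lru *lru, struct my_elem *elem)
{
	bpf_lru_push_free(lru, &elem->lru_node);
}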
/*
 * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
* */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/in.h> #include <linux/ipv6.h> #include "rds.h" #include "loop.h" static char * const rds_trans_modules[] = { [RDS_TRANS_IB] = "rds_rdma", [RDS_TRANS_GAP] = NULL, [RDS_TRANS_TCP] = "rds_tcp", }; static struct rds_transport *transports[RDS_TRANS_COUNT]; static DECLARE_RWSEM(rds_trans_sem); void rds_trans_register(struct rds_transport *trans) { BUG_ON(strlen(trans->t_name) + 1 > TRANSNAMSIZ); down_write(&rds_trans_sem); if (transports[trans->t_type]) printk(KERN_ERR "RDS Transport type %d already registered\n", trans->t_type); else { transports[trans->t_type] = trans; printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name); } up_write(&rds_trans_sem); } EXPORT_SYMBOL_GPL(rds_trans_register); void rds_trans_unregister(struct rds_transport *trans) { down_write(&rds_trans_sem); transports[trans->t_type] = NULL; printk(KERN_INFO "Unregistered RDS/%s transport\n", trans->t_name); up_write(&rds_trans_sem); } EXPORT_SYMBOL_GPL(rds_trans_unregister); void rds_trans_put(struct rds_transport *trans) { if (trans) module_put(trans->t_owner); } struct rds_transport *rds_trans_get_preferred(struct net *net, const struct in6_addr *addr, __u32 scope_id) { struct rds_transport *ret = NULL; struct rds_transport *trans; unsigned int i; if (ipv6_addr_v4mapped(addr)) { if (*(u_int8_t *)&addr->s6_addr32[3] == IN_LOOPBACKNET) return &rds_loop_transport; } else if (ipv6_addr_loopback(addr)) { return &rds_loop_transport; } down_read(&rds_trans_sem); for (i = 0; i < RDS_TRANS_COUNT; i++) { trans = transports[i]; if (trans && (trans->laddr_check(net, addr, scope_id) == 0) && (!trans->t_owner || try_module_get(trans->t_owner))) { ret = trans; break; } } up_read(&rds_trans_sem); return ret; } struct rds_transport *rds_trans_get(int t_type) { struct rds_transport *ret = NULL; struct rds_transport *trans; down_read(&rds_trans_sem); trans = transports[t_type]; if (!trans) { up_read(&rds_trans_sem); if (rds_trans_modules[t_type]) request_module(rds_trans_modules[t_type]); down_read(&rds_trans_sem); trans = transports[t_type]; } if (trans && trans->t_type == t_type && (!trans->t_owner || try_module_get(trans->t_owner))) ret = trans; up_read(&rds_trans_sem); return ret; } /* * This returns the number of stats entries in the snapshot and only * copies them using the iter if there is enough space for them. The * caller passes in the global stats so that we can size and copy while * holding the lock. */ unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail) { struct rds_transport *trans; unsigned int total = 0; unsigned int part; int i; rds_info_iter_unmap(iter); down_read(&rds_trans_sem); for (i = 0; i < RDS_TRANS_COUNT; i++) { trans = transports[i]; if (!trans || !trans->stats_info_copy) continue; part = trans->stats_info_copy(iter, avail); avail -= min(avail, part); total += part; } up_read(&rds_trans_sem); return total; }
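rds_trans_register() and rds_trans_unregister() above are the whole lifecycle a transport module goes through; the lookup helpers then pick a registered transport either by its t_type slot or via laddr_check(). A minimal registration sketch under those assumptions (hypothetical names, not an in-tree transport, and only the fields transport.c itself consults) might look like:

/*
 * Registration sketch: a real transport also fills in connection and
 * message ops that are omitted here.
 */
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/in6.h>
#include "rds.h"

static int my_laddr_check(struct net *net, const struct in6_addr *addr,
			  __u32 scope_id)
{
	/* Return 0 if this transport can carry traffic for @addr. */
	return -EADDRNOTAVAIL;
}

static struct rds_transport my_transport = {
	.t_owner	= THIS_MODULE,
	.t_name		= "my_trans",
	.t_type		= RDS_TRANS_TCP,  /* must be a valid slot below RDS_TRANS_COUNT */
	.laddr_check	= my_laddr_check,
};

static int __init my_rds_init(void)
{
	rds_trans_register(&my_transport);
	return 0;
}

static void __exit my_rds_exit(void)
{
	rds_trans_unregister(&my_transport);
}

module_init(my_rds_init);
module_exit(my_rds_exit);
MODULE_LICENSE("GPL");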
1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2007 Patrick McHardy <kaber@trash.net> * * The code this is based on carried the following copyright notice: * --- * (C) Copyright 2001-2006 * Alex Zeffertt, Cambridge Broadband Ltd, ajz@cambridgebroadband.com * Re-worked by Ben Greear <greearb@candelatech.com> * --- */ #include <linux/kernel.h> #include <linux/types.h> #include <linux/module.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/rculist.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/net_tstamp.h> #include <linux/ethtool.h> #include <linux/if_arp.h> #include <linux/if_vlan.h> #include <linux/if_link.h> #include <linux/if_macvlan.h> #include <linux/hash.h> #include <linux/workqueue.h> #include <net/rtnetlink.h> #include <net/xfrm.h> #include <linux/netpoll.h> #include <linux/phy.h> #define MACVLAN_HASH_BITS 8 #define MACVLAN_HASH_SIZE (1<<MACVLAN_HASH_BITS) #define MACVLAN_DEFAULT_BC_QUEUE_LEN 1000 #define MACVLAN_F_PASSTHRU 1 #define MACVLAN_F_ADDRCHANGE 2 struct macvlan_port { struct net_device *dev; struct hlist_head vlan_hash[MACVLAN_HASH_SIZE]; struct list_head vlans; struct sk_buff_head bc_queue; struct work_struct bc_work; u32 bc_queue_len_used; int bc_cutoff; u32 flags; int count; struct hlist_head vlan_source_hash[MACVLAN_HASH_SIZE]; DECLARE_BITMAP(bc_filter, MACVLAN_MC_FILTER_SZ); DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ); unsigned char perm_addr[ETH_ALEN]; }; struct macvlan_source_entry { struct hlist_node hlist; struct macvlan_dev *vlan; unsigned char addr[6+2] __aligned(sizeof(u16)); struct rcu_head rcu; }; struct macvlan_skb_cb { const struct macvlan_dev *src; }; #define MACVLAN_SKB_CB(__skb) ((struct macvlan_skb_cb *)&((__skb)->cb[0])) static void macvlan_port_destroy(struct net_device *dev); static void update_port_bc_queue_len(struct macvlan_port *port); static inline bool macvlan_passthru(const struct macvlan_port *port) { return port->flags & MACVLAN_F_PASSTHRU; } static inline void 
macvlan_set_passthru(struct macvlan_port *port) { port->flags |= MACVLAN_F_PASSTHRU; } static inline bool macvlan_addr_change(const struct macvlan_port *port) { return port->flags & MACVLAN_F_ADDRCHANGE; } static inline void macvlan_set_addr_change(struct macvlan_port *port) { port->flags |= MACVLAN_F_ADDRCHANGE; } static inline void macvlan_clear_addr_change(struct macvlan_port *port) { port->flags &= ~MACVLAN_F_ADDRCHANGE; } /* Hash Ethernet address */ static u32 macvlan_eth_hash(const unsigned char *addr) { u64 value = get_unaligned((u64 *)addr); /* only want 6 bytes */ #ifdef __BIG_ENDIAN value >>= 16; #else value <<= 16; #endif return hash_64(value, MACVLAN_HASH_BITS); } static struct macvlan_port *macvlan_port_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler_data); } static struct macvlan_port *macvlan_port_get_rtnl(const struct net_device *dev) { return rtnl_dereference(dev->rx_handler_data); } static struct macvlan_dev *macvlan_hash_lookup(const struct macvlan_port *port, const unsigned char *addr) { struct macvlan_dev *vlan; u32 idx = macvlan_eth_hash(addr); hlist_for_each_entry_rcu(vlan, &port->vlan_hash[idx], hlist, lockdep_rtnl_is_held()) { if (ether_addr_equal_64bits(vlan->dev->dev_addr, addr)) return vlan; } return NULL; } static struct macvlan_source_entry *macvlan_hash_lookup_source( const struct macvlan_dev *vlan, const unsigned char *addr) { struct macvlan_source_entry *entry; u32 idx = macvlan_eth_hash(addr); struct hlist_head *h = &vlan->port->vlan_source_hash[idx]; hlist_for_each_entry_rcu(entry, h, hlist, lockdep_rtnl_is_held()) { if (ether_addr_equal_64bits(entry->addr, addr) && entry->vlan == vlan) return entry; } return NULL; } static int macvlan_hash_add_source(struct macvlan_dev *vlan, const unsigned char *addr) { struct macvlan_port *port = vlan->port; struct macvlan_source_entry *entry; struct hlist_head *h; entry = macvlan_hash_lookup_source(vlan, addr); if (entry) return 0; entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; ether_addr_copy(entry->addr, addr); entry->vlan = vlan; h = &port->vlan_source_hash[macvlan_eth_hash(addr)]; hlist_add_head_rcu(&entry->hlist, h); vlan->macaddr_count++; return 0; } static void macvlan_hash_add(struct macvlan_dev *vlan) { struct macvlan_port *port = vlan->port; const unsigned char *addr = vlan->dev->dev_addr; u32 idx = macvlan_eth_hash(addr); hlist_add_head_rcu(&vlan->hlist, &port->vlan_hash[idx]); } static void macvlan_hash_del_source(struct macvlan_source_entry *entry) { hlist_del_rcu(&entry->hlist); kfree_rcu(entry, rcu); } static void macvlan_hash_del(struct macvlan_dev *vlan, bool sync) { hlist_del_rcu(&vlan->hlist); if (sync) synchronize_rcu(); } static void macvlan_hash_change_addr(struct macvlan_dev *vlan, const unsigned char *addr) { macvlan_hash_del(vlan, true); /* Now that we are unhashed it is safe to change the device * address without confusing packet delivery. */ eth_hw_addr_set(vlan->dev, addr); macvlan_hash_add(vlan); } static bool macvlan_addr_busy(const struct macvlan_port *port, const unsigned char *addr) { /* Test to see if the specified address is * currently in use by the underlying device or * another macvlan. 
*/ if (!macvlan_passthru(port) && !macvlan_addr_change(port) && ether_addr_equal_64bits(port->dev->dev_addr, addr)) return true; if (macvlan_hash_lookup(port, addr)) return true; return false; } static int macvlan_broadcast_one(struct sk_buff *skb, const struct macvlan_dev *vlan, const struct ethhdr *eth, bool local) { struct net_device *dev = vlan->dev; if (local) return __dev_forward_skb(dev, skb); skb->dev = dev; if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) skb->pkt_type = PACKET_BROADCAST; else skb->pkt_type = PACKET_MULTICAST; return 0; } static u32 macvlan_hash_mix(const struct macvlan_dev *vlan) { return (u32)(((unsigned long)vlan) >> L1_CACHE_SHIFT); } static unsigned int mc_hash(const struct macvlan_dev *vlan, const unsigned char *addr) { u32 val = __get_unaligned_cpu32(addr + 2); val ^= macvlan_hash_mix(vlan); return hash_32(val, MACVLAN_MC_FILTER_BITS); } static void macvlan_broadcast(struct sk_buff *skb, const struct macvlan_port *port, struct net_device *src, enum macvlan_mode mode) { const struct ethhdr *eth = eth_hdr(skb); const struct macvlan_dev *vlan; struct sk_buff *nskb; unsigned int i; int err; unsigned int hash; if (skb->protocol == htons(ETH_P_PAUSE)) return; hash_for_each_rcu(port->vlan_hash, i, vlan, hlist) { if (vlan->dev == src || !(vlan->mode & mode)) continue; hash = mc_hash(vlan, eth->h_dest); if (!test_bit(hash, vlan->mc_filter)) continue; err = NET_RX_DROP; nskb = skb_clone(skb, GFP_ATOMIC); if (likely(nskb)) err = macvlan_broadcast_one(nskb, vlan, eth, mode == MACVLAN_MODE_BRIDGE) ?: netif_rx(nskb); macvlan_count_rx(vlan, skb->len + ETH_HLEN, err == NET_RX_SUCCESS, true); } } static void macvlan_multicast_rx(const struct macvlan_port *port, const struct macvlan_dev *src, struct sk_buff *skb) { if (!src) /* frame comes from an external address */ macvlan_broadcast(skb, port, NULL, MACVLAN_MODE_PRIVATE | MACVLAN_MODE_VEPA | MACVLAN_MODE_PASSTHRU| MACVLAN_MODE_BRIDGE); else if (src->mode == MACVLAN_MODE_VEPA) /* flood to everyone except source */ macvlan_broadcast(skb, port, src->dev, MACVLAN_MODE_VEPA | MACVLAN_MODE_BRIDGE); else /* * flood only to VEPA ports, bridge ports * already saw the frame on the way out. 
*/ macvlan_broadcast(skb, port, src->dev, MACVLAN_MODE_VEPA); } static void macvlan_process_broadcast(struct work_struct *w) { struct macvlan_port *port = container_of(w, struct macvlan_port, bc_work); struct sk_buff *skb; struct sk_buff_head list; __skb_queue_head_init(&list); spin_lock_bh(&port->bc_queue.lock); skb_queue_splice_tail_init(&port->bc_queue, &list); spin_unlock_bh(&port->bc_queue.lock); while ((skb = __skb_dequeue(&list))) { const struct macvlan_dev *src = MACVLAN_SKB_CB(skb)->src; rcu_read_lock(); macvlan_multicast_rx(port, src, skb); rcu_read_unlock(); if (src) dev_put(src->dev); consume_skb(skb); cond_resched(); } } static void macvlan_broadcast_enqueue(struct macvlan_port *port, const struct macvlan_dev *src, struct sk_buff *skb) { struct sk_buff *nskb; int err = -ENOMEM; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) goto err; MACVLAN_SKB_CB(nskb)->src = src; spin_lock(&port->bc_queue.lock); if (skb_queue_len(&port->bc_queue) < port->bc_queue_len_used) { if (src) dev_hold(src->dev); __skb_queue_tail(&port->bc_queue, nskb); err = 0; } spin_unlock(&port->bc_queue.lock); queue_work(system_unbound_wq, &port->bc_work); if (err) goto free_nskb; return; free_nskb: kfree_skb(nskb); err: dev_core_stats_rx_dropped_inc(skb->dev); } static void macvlan_flush_sources(struct macvlan_port *port, struct macvlan_dev *vlan) { struct macvlan_source_entry *entry; struct hlist_node *next; int i; hash_for_each_safe(port->vlan_source_hash, i, next, entry, hlist) if (entry->vlan == vlan) macvlan_hash_del_source(entry); vlan->macaddr_count = 0; } static void macvlan_forward_source_one(struct sk_buff *skb, struct macvlan_dev *vlan) { struct sk_buff *nskb; struct net_device *dev; int len; int ret; dev = vlan->dev; if (unlikely(!(dev->flags & IFF_UP))) return; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return; len = nskb->len + ETH_HLEN; nskb->dev = dev; if (ether_addr_equal_64bits(eth_hdr(skb)->h_dest, dev->dev_addr)) nskb->pkt_type = PACKET_HOST; ret = __netif_rx(nskb); macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false); } static bool macvlan_forward_source(struct sk_buff *skb, struct macvlan_port *port, const unsigned char *addr) { struct macvlan_source_entry *entry; u32 idx = macvlan_eth_hash(addr); struct hlist_head *h = &port->vlan_source_hash[idx]; bool consume = false; hlist_for_each_entry_rcu(entry, h, hlist) { if (ether_addr_equal_64bits(entry->addr, addr)) { if (entry->vlan->flags & MACVLAN_FLAG_NODST) consume = true; macvlan_forward_source_one(skb, entry->vlan); } } return consume; } /* called under rcu_read_lock() from netif_receive_skb */ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb) { struct macvlan_port *port; struct sk_buff *skb = *pskb; const struct ethhdr *eth = eth_hdr(skb); const struct macvlan_dev *vlan; const struct macvlan_dev *src; struct net_device *dev; unsigned int len = 0; int ret; rx_handler_result_t handle_res; /* Packets from dev_loopback_xmit() do not have L2 header, bail out */ if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; port = macvlan_port_get_rcu(skb->dev); if (is_multicast_ether_addr(eth->h_dest)) { unsigned int hash; skb = ip_check_defrag(dev_net(skb->dev), skb, IP_DEFRAG_MACVLAN); if (!skb) return RX_HANDLER_CONSUMED; *pskb = skb; eth = eth_hdr(skb); if (macvlan_forward_source(skb, port, eth->h_source)) { kfree_skb(skb); return RX_HANDLER_CONSUMED; } src = macvlan_hash_lookup(port, eth->h_source); if (src && src->mode != MACVLAN_MODE_VEPA && src->mode != MACVLAN_MODE_BRIDGE) { /* forward to 
original port. */ vlan = src; ret = macvlan_broadcast_one(skb, vlan, eth, 0) ?: __netif_rx(skb); handle_res = RX_HANDLER_CONSUMED; goto out; } hash = mc_hash(NULL, eth->h_dest); if (test_bit(hash, port->bc_filter)) macvlan_broadcast_enqueue(port, src, skb); else if (test_bit(hash, port->mc_filter)) macvlan_multicast_rx(port, src, skb); return RX_HANDLER_PASS; } if (macvlan_forward_source(skb, port, eth->h_source)) { kfree_skb(skb); return RX_HANDLER_CONSUMED; } if (macvlan_passthru(port)) vlan = list_first_or_null_rcu(&port->vlans, struct macvlan_dev, list); else vlan = macvlan_hash_lookup(port, eth->h_dest); if (!vlan || vlan->mode == MACVLAN_MODE_SOURCE) return RX_HANDLER_PASS; dev = vlan->dev; if (unlikely(!(dev->flags & IFF_UP))) { kfree_skb(skb); return RX_HANDLER_CONSUMED; } len = skb->len + ETH_HLEN; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) { ret = NET_RX_DROP; handle_res = RX_HANDLER_CONSUMED; goto out; } *pskb = skb; skb->dev = dev; skb->pkt_type = PACKET_HOST; ret = NET_RX_SUCCESS; handle_res = RX_HANDLER_ANOTHER; out: macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false); return handle_res; } static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) { const struct macvlan_dev *vlan = netdev_priv(dev); const struct macvlan_port *port = vlan->port; const struct macvlan_dev *dest; if (vlan->mode == MACVLAN_MODE_BRIDGE) { const struct ethhdr *eth = skb_eth_hdr(skb); /* send to other bridge ports directly */ if (is_multicast_ether_addr(eth->h_dest)) { skb_reset_mac_header(skb); macvlan_broadcast(skb, port, dev, MACVLAN_MODE_BRIDGE); goto xmit_world; } dest = macvlan_hash_lookup(port, eth->h_dest); if (dest && dest->mode == MACVLAN_MODE_BRIDGE) { /* send to lowerdev first for its network taps */ dev_forward_skb(vlan->lowerdev, skb); return NET_XMIT_SUCCESS; } } xmit_world: skb->dev = vlan->lowerdev; return dev_queue_xmit_accel(skb, netdev_get_sb_channel(dev) ? dev : NULL); } static inline netdev_tx_t macvlan_netpoll_send_skb(struct macvlan_dev *vlan, struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER return netpoll_send_skb(vlan->netpoll, skb); #else BUG(); return NETDEV_TX_OK; #endif } static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); unsigned int len = skb->len; int ret; if (unlikely(netpoll_tx_running(dev))) return macvlan_netpoll_send_skb(vlan, skb); ret = macvlan_queue_xmit(skb, dev); if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { struct vlan_pcpu_stats *pcpu_stats; pcpu_stats = this_cpu_ptr(vlan->pcpu_stats); u64_stats_update_begin(&pcpu_stats->syncp); u64_stats_inc(&pcpu_stats->tx_packets); u64_stats_add(&pcpu_stats->tx_bytes, len); u64_stats_update_end(&pcpu_stats->syncp); } else { this_cpu_inc(vlan->pcpu_stats->tx_dropped); } return ret; } static int macvlan_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned len) { const struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; return dev_hard_header(skb, lowerdev, type, daddr, saddr ? 
: dev->dev_addr, len); } static const struct header_ops macvlan_hard_header_ops = { .create = macvlan_hard_header, .parse = eth_header_parse, .cache = eth_header_cache, .cache_update = eth_header_cache_update, .parse_protocol = eth_header_parse_protocol, }; static int macvlan_open(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; int err; if (macvlan_passthru(vlan->port)) { if (!(vlan->flags & MACVLAN_FLAG_NOPROMISC)) { err = dev_set_promiscuity(lowerdev, 1); if (err < 0) goto out; } goto hash_add; } err = -EADDRINUSE; if (macvlan_addr_busy(vlan->port, dev->dev_addr)) goto out; /* Attempt to populate accel_priv which is used to offload the L2 * forwarding requests for unicast packets. */ if (lowerdev->features & NETIF_F_HW_L2FW_DOFFLOAD) vlan->accel_priv = lowerdev->netdev_ops->ndo_dfwd_add_station(lowerdev, dev); /* If earlier attempt to offload failed, or accel_priv is not * populated we must add the unicast address to the lower device. */ if (IS_ERR_OR_NULL(vlan->accel_priv)) { vlan->accel_priv = NULL; err = dev_uc_add(lowerdev, dev->dev_addr); if (err < 0) goto out; } if (dev->flags & IFF_ALLMULTI) { err = dev_set_allmulti(lowerdev, 1); if (err < 0) goto del_unicast; } if (dev->flags & IFF_PROMISC) { err = dev_set_promiscuity(lowerdev, 1); if (err < 0) goto clear_multi; } hash_add: macvlan_hash_add(vlan); return 0; clear_multi: if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(lowerdev, -1); del_unicast: if (vlan->accel_priv) { lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev, vlan->accel_priv); vlan->accel_priv = NULL; } else { dev_uc_del(lowerdev, dev->dev_addr); } out: return err; } static int macvlan_stop(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; if (vlan->accel_priv) { lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev, vlan->accel_priv); vlan->accel_priv = NULL; } dev_uc_unsync(lowerdev, dev); dev_mc_unsync(lowerdev, dev); if (macvlan_passthru(vlan->port)) { if (!(vlan->flags & MACVLAN_FLAG_NOPROMISC)) dev_set_promiscuity(lowerdev, -1); goto hash_del; } if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(lowerdev, -1); if (dev->flags & IFF_PROMISC) dev_set_promiscuity(lowerdev, -1); dev_uc_del(lowerdev, dev->dev_addr); hash_del: macvlan_hash_del(vlan, !dev->dismantle); return 0; } static int macvlan_sync_address(struct net_device *dev, const unsigned char *addr) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; struct macvlan_port *port = vlan->port; int err; if (!(dev->flags & IFF_UP)) { /* Just copy in the new address */ eth_hw_addr_set(dev, addr); } else { /* Rehash and update the device filters */ if (macvlan_addr_busy(vlan->port, addr)) return -EADDRINUSE; if (!macvlan_passthru(port)) { err = dev_uc_add(lowerdev, addr); if (err) return err; dev_uc_del(lowerdev, dev->dev_addr); } macvlan_hash_change_addr(vlan, addr); } if (macvlan_passthru(port) && !macvlan_addr_change(port)) { /* Since addr_change isn't set, we are here due to lower * device change. Save the lower-dev address so we can * restore it later. 
*/ ether_addr_copy(vlan->port->perm_addr, lowerdev->dev_addr); } macvlan_clear_addr_change(port); return 0; } static int macvlan_set_mac_address(struct net_device *dev, void *p) { struct macvlan_dev *vlan = netdev_priv(dev); struct sockaddr *addr = p; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; /* If the addresses are the same, this is a no-op */ if (ether_addr_equal(dev->dev_addr, addr->sa_data)) return 0; if (vlan->mode == MACVLAN_MODE_PASSTHRU) { macvlan_set_addr_change(vlan->port); return dev_set_mac_address(vlan->lowerdev, addr, NULL); } if (macvlan_addr_busy(vlan->port, addr->sa_data)) return -EADDRINUSE; return macvlan_sync_address(dev, addr->sa_data); } static void macvlan_change_rx_flags(struct net_device *dev, int change) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; if (dev->flags & IFF_UP) { if (change & IFF_ALLMULTI) dev_set_allmulti(lowerdev, dev->flags & IFF_ALLMULTI ? 1 : -1); if (!macvlan_passthru(vlan->port) && change & IFF_PROMISC) dev_set_promiscuity(lowerdev, dev->flags & IFF_PROMISC ? 1 : -1); } } static void macvlan_compute_filter(unsigned long *mc_filter, struct net_device *dev, struct macvlan_dev *vlan, int cutoff) { if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) { bitmap_fill(mc_filter, MACVLAN_MC_FILTER_SZ); } else { DECLARE_BITMAP(filter, MACVLAN_MC_FILTER_SZ); struct netdev_hw_addr *ha; bitmap_zero(filter, MACVLAN_MC_FILTER_SZ); netdev_for_each_mc_addr(ha, dev) { if (!vlan && ha->synced <= cutoff) continue; __set_bit(mc_hash(vlan, ha->addr), filter); } __set_bit(mc_hash(vlan, dev->broadcast), filter); bitmap_copy(mc_filter, filter, MACVLAN_MC_FILTER_SZ); } } static void macvlan_recompute_bc_filter(struct macvlan_dev *vlan) { if (vlan->port->bc_cutoff < 0) { bitmap_zero(vlan->port->bc_filter, MACVLAN_MC_FILTER_SZ); return; } macvlan_compute_filter(vlan->port->bc_filter, vlan->lowerdev, NULL, vlan->port->bc_cutoff); } static void macvlan_set_mac_lists(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); macvlan_compute_filter(vlan->mc_filter, dev, vlan, 0); dev_uc_sync(vlan->lowerdev, dev); dev_mc_sync(vlan->lowerdev, dev); /* This is slightly inaccurate as we're including the subscription * list of vlan->lowerdev too. * * Bug alert: This only works if everyone has the same broadcast * address as lowerdev. As soon as someone changes theirs this * will break. * * However, this is already broken as when you change your broadcast * address we don't get called. * * The solution is to maintain a list of broadcast addresses like * we do for uc/mc, if you care. 
*/ macvlan_compute_filter(vlan->port->mc_filter, vlan->lowerdev, NULL, 0); macvlan_recompute_bc_filter(vlan); } static void update_port_bc_cutoff(struct macvlan_dev *vlan, int cutoff) { if (vlan->port->bc_cutoff == cutoff) return; vlan->port->bc_cutoff = cutoff; macvlan_recompute_bc_filter(vlan); } static int macvlan_change_mtu(struct net_device *dev, int new_mtu) { struct macvlan_dev *vlan = netdev_priv(dev); if (vlan->lowerdev->mtu < new_mtu) return -EINVAL; WRITE_ONCE(dev->mtu, new_mtu); return 0; } static int macvlan_hwtstamp_get(struct net_device *dev, struct kernel_hwtstamp_config *cfg) { struct net_device *real_dev = macvlan_dev_real_dev(dev); return generic_hwtstamp_get_lower(real_dev, cfg); } static int macvlan_hwtstamp_set(struct net_device *dev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack) { struct net_device *real_dev = macvlan_dev_real_dev(dev); if (!net_eq(dev_net(dev), &init_net)) return -EOPNOTSUPP; return generic_hwtstamp_set_lower(real_dev, cfg, extack); } /* * macvlan network devices have devices nesting below it and are a special * "super class" of normal network devices; split their locks off into a * separate class since they always nest. */ static struct lock_class_key macvlan_netdev_addr_lock_key; #define ALWAYS_ON_OFFLOADS \ (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE | \ NETIF_F_GSO_ROBUST | NETIF_F_GSO_ENCAP_ALL) #define ALWAYS_ON_FEATURES ALWAYS_ON_OFFLOADS #define MACVLAN_FEATURES \ (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \ NETIF_F_GSO | NETIF_F_TSO | NETIF_F_LRO | \ NETIF_F_TSO_ECN | NETIF_F_TSO6 | NETIF_F_GRO | NETIF_F_RXCSUM | \ NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER) #define MACVLAN_STATE_MASK \ ((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT)) static void macvlan_set_lockdep_class(struct net_device *dev) { netdev_lockdep_set_classes(dev); lockdep_set_class(&dev->addr_list_lock, &macvlan_netdev_addr_lock_key); } static int macvlan_init(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; struct macvlan_port *port = vlan->port; dev->state = (dev->state & ~MACVLAN_STATE_MASK) | (lowerdev->state & MACVLAN_STATE_MASK); dev->features = lowerdev->features & MACVLAN_FEATURES; dev->features |= ALWAYS_ON_FEATURES; dev->hw_features |= NETIF_F_LRO; dev->vlan_features = lowerdev->vlan_features & MACVLAN_FEATURES; dev->vlan_features |= ALWAYS_ON_OFFLOADS; dev->hw_enc_features |= dev->features; dev->lltx = true; netif_inherit_tso_max(dev, lowerdev); dev->hard_header_len = lowerdev->hard_header_len; macvlan_set_lockdep_class(dev); vlan->pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->pcpu_stats) return -ENOMEM; port->count += 1; /* Get macvlan's reference to lowerdev */ netdev_hold(lowerdev, &vlan->dev_tracker, GFP_KERNEL); return 0; } static void macvlan_uninit(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_port *port = vlan->port; free_percpu(vlan->pcpu_stats); macvlan_flush_sources(port, vlan); port->count -= 1; if (!port->count) macvlan_port_destroy(port->dev); } static void macvlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct macvlan_dev *vlan = netdev_priv(dev); if (vlan->pcpu_stats) { struct vlan_pcpu_stats *p; u64 rx_packets, rx_bytes, rx_multicast, tx_packets, tx_bytes; u32 rx_errors = 0, tx_dropped = 0; unsigned int start; int i; for_each_possible_cpu(i) { p = per_cpu_ptr(vlan->pcpu_stats, i); do { start = 
u64_stats_fetch_begin(&p->syncp); rx_packets = u64_stats_read(&p->rx_packets); rx_bytes = u64_stats_read(&p->rx_bytes); rx_multicast = u64_stats_read(&p->rx_multicast); tx_packets = u64_stats_read(&p->tx_packets); tx_bytes = u64_stats_read(&p->tx_bytes); } while (u64_stats_fetch_retry(&p->syncp, start)); stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; stats->multicast += rx_multicast; stats->tx_packets += tx_packets; stats->tx_bytes += tx_bytes; /* rx_errors & tx_dropped are u32, updated * without syncp protection. */ rx_errors += READ_ONCE(p->rx_errors); tx_dropped += READ_ONCE(p->tx_dropped); } stats->rx_errors = rx_errors; stats->rx_dropped = rx_errors; stats->tx_dropped = tx_dropped; } } static int macvlan_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; return vlan_vid_add(lowerdev, proto, vid); } static int macvlan_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; vlan_vid_del(lowerdev, proto, vid); return 0; } static int macvlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags, bool *notified, struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); int err = -EINVAL; /* Support unicast filter only on passthru devices. * Multicast filter should be allowed on all devices. */ if (!macvlan_passthru(vlan->port) && is_unicast_ether_addr(addr)) return -EOPNOTSUPP; if (flags & NLM_F_REPLACE) return -EOPNOTSUPP; if (is_unicast_ether_addr(addr)) err = dev_uc_add_excl(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_add_excl(dev, addr); return err; } static int macvlan_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, bool *notified, struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); int err = -EINVAL; /* Support unicast filter only on passthru devices. * Multicast filter should be allowed on all devices. 
*/ if (!macvlan_passthru(vlan->port) && is_unicast_ether_addr(addr)) return -EOPNOTSUPP; if (is_unicast_ether_addr(addr)) err = dev_uc_del(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_del(dev, addr); return err; } static void macvlan_ethtool_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { strscpy(drvinfo->driver, "macvlan", sizeof(drvinfo->driver)); strscpy(drvinfo->version, "0.1", sizeof(drvinfo->version)); } static int macvlan_ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { const struct macvlan_dev *vlan = netdev_priv(dev); return __ethtool_get_link_ksettings(vlan->lowerdev, cmd); } static int macvlan_ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info) { struct net_device *real_dev = macvlan_dev_real_dev(dev); return ethtool_get_ts_info_by_layer(real_dev, info); } static netdev_features_t macvlan_fix_features(struct net_device *dev, netdev_features_t features) { struct macvlan_dev *vlan = netdev_priv(dev); netdev_features_t lowerdev_features = vlan->lowerdev->features; netdev_features_t mask; features |= NETIF_F_ALL_FOR_ALL; features &= (vlan->set_features | ~MACVLAN_FEATURES); mask = features; lowerdev_features &= (features | ~NETIF_F_LRO); features = netdev_increment_features(lowerdev_features, features, mask); features |= ALWAYS_ON_FEATURES; features &= (ALWAYS_ON_FEATURES | MACVLAN_FEATURES); return features; } #ifdef CONFIG_NET_POLL_CONTROLLER static void macvlan_dev_poll_controller(struct net_device *dev) { return; } static int macvlan_dev_netpoll_setup(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *real_dev = vlan->lowerdev; struct netpoll *netpoll; int err; netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL); err = -ENOMEM; if (!netpoll) goto out; err = __netpoll_setup(netpoll, real_dev); if (err) { kfree(netpoll); goto out; } vlan->netpoll = netpoll; out: return err; } static void macvlan_dev_netpoll_cleanup(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct netpoll *netpoll = vlan->netpoll; if (!netpoll) return; vlan->netpoll = NULL; __netpoll_free(netpoll); } #endif /* CONFIG_NET_POLL_CONTROLLER */ static int macvlan_dev_get_iflink(const struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); return READ_ONCE(vlan->lowerdev->ifindex); } static const struct ethtool_ops macvlan_ethtool_ops = { .get_link = ethtool_op_get_link, .get_link_ksettings = macvlan_ethtool_get_link_ksettings, .get_drvinfo = macvlan_ethtool_get_drvinfo, .get_ts_info = macvlan_ethtool_get_ts_info, }; static const struct net_device_ops macvlan_netdev_ops = { .ndo_init = macvlan_init, .ndo_uninit = macvlan_uninit, .ndo_open = macvlan_open, .ndo_stop = macvlan_stop, .ndo_start_xmit = macvlan_start_xmit, .ndo_change_mtu = macvlan_change_mtu, .ndo_fix_features = macvlan_fix_features, .ndo_change_rx_flags = macvlan_change_rx_flags, .ndo_set_mac_address = macvlan_set_mac_address, .ndo_set_rx_mode = macvlan_set_mac_lists, .ndo_get_stats64 = macvlan_dev_get_stats64, .ndo_validate_addr = eth_validate_addr, .ndo_vlan_rx_add_vid = macvlan_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = macvlan_vlan_rx_kill_vid, .ndo_fdb_add = macvlan_fdb_add, .ndo_fdb_del = macvlan_fdb_del, .ndo_fdb_dump = ndo_dflt_fdb_dump, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = macvlan_dev_poll_controller, .ndo_netpoll_setup = macvlan_dev_netpoll_setup, .ndo_netpoll_cleanup = macvlan_dev_netpoll_cleanup, #endif .ndo_get_iflink = 
macvlan_dev_get_iflink, .ndo_features_check = passthru_features_check, .ndo_hwtstamp_get = macvlan_hwtstamp_get, .ndo_hwtstamp_set = macvlan_hwtstamp_set, }; static void macvlan_dev_free(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); /* Get rid of the macvlan's reference to lowerdev */ netdev_put(vlan->lowerdev, &vlan->dev_tracker); } void macvlan_common_setup(struct net_device *dev) { ether_setup(dev); /* ether_setup() has set dev->min_mtu to ETH_MIN_MTU. */ dev->max_mtu = ETH_MAX_MTU; dev->priv_flags &= ~IFF_TX_SKB_SHARING; netif_keep_dst(dev); dev->priv_flags |= IFF_UNICAST_FLT; dev->change_proto_down = true; dev->netdev_ops = &macvlan_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = macvlan_dev_free; dev->header_ops = &macvlan_hard_header_ops; dev->ethtool_ops = &macvlan_ethtool_ops; } EXPORT_SYMBOL_GPL(macvlan_common_setup); static void macvlan_setup(struct net_device *dev) { macvlan_common_setup(dev); dev->priv_flags |= IFF_NO_QUEUE; } static int macvlan_port_create(struct net_device *dev) { struct macvlan_port *port; unsigned int i; int err; if (dev->type != ARPHRD_ETHER || dev->flags & IFF_LOOPBACK) return -EINVAL; if (netdev_is_rx_handler_busy(dev)) return -EBUSY; port = kzalloc(sizeof(*port), GFP_KERNEL); if (port == NULL) return -ENOMEM; port->dev = dev; ether_addr_copy(port->perm_addr, dev->dev_addr); INIT_LIST_HEAD(&port->vlans); for (i = 0; i < MACVLAN_HASH_SIZE; i++) INIT_HLIST_HEAD(&port->vlan_hash[i]); for (i = 0; i < MACVLAN_HASH_SIZE; i++) INIT_HLIST_HEAD(&port->vlan_source_hash[i]); port->bc_queue_len_used = 0; port->bc_cutoff = 1; skb_queue_head_init(&port->bc_queue); INIT_WORK(&port->bc_work, macvlan_process_broadcast); err = netdev_rx_handler_register(dev, macvlan_handle_frame, port); if (err) kfree(port); else dev->priv_flags |= IFF_MACVLAN_PORT; return err; } static void macvlan_port_destroy(struct net_device *dev) { struct macvlan_port *port = macvlan_port_get_rtnl(dev); struct sk_buff *skb; dev->priv_flags &= ~IFF_MACVLAN_PORT; netdev_rx_handler_unregister(dev); /* After this point, no packet can schedule bc_work anymore, * but we need to cancel it and purge left skbs if any. */ cancel_work_sync(&port->bc_work); while ((skb = __skb_dequeue(&port->bc_queue))) { const struct macvlan_dev *src = MACVLAN_SKB_CB(skb)->src; if (src) dev_put(src->dev); kfree_skb(skb); } /* If the lower device address has been changed by passthru * macvlan, put it back. 
*/ if (macvlan_passthru(port) && !ether_addr_equal(port->dev->dev_addr, port->perm_addr)) { struct sockaddr sa; sa.sa_family = port->dev->type; memcpy(&sa.sa_data, port->perm_addr, port->dev->addr_len); dev_set_mac_address(port->dev, &sa, NULL); } kfree(port); } static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct nlattr *nla, *head; int rem, len; if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) return -EADDRNOTAVAIL; } if (!data) return 0; if (data[IFLA_MACVLAN_FLAGS] && nla_get_u16(data[IFLA_MACVLAN_FLAGS]) & ~(MACVLAN_FLAG_NOPROMISC | MACVLAN_FLAG_NODST)) return -EINVAL; if (data[IFLA_MACVLAN_MODE]) { switch (nla_get_u32(data[IFLA_MACVLAN_MODE])) { case MACVLAN_MODE_PRIVATE: case MACVLAN_MODE_VEPA: case MACVLAN_MODE_BRIDGE: case MACVLAN_MODE_PASSTHRU: case MACVLAN_MODE_SOURCE: break; default: return -EINVAL; } } if (data[IFLA_MACVLAN_MACADDR_MODE]) { switch (nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE])) { case MACVLAN_MACADDR_ADD: case MACVLAN_MACADDR_DEL: case MACVLAN_MACADDR_FLUSH: case MACVLAN_MACADDR_SET: break; default: return -EINVAL; } } if (data[IFLA_MACVLAN_MACADDR]) { if (nla_len(data[IFLA_MACVLAN_MACADDR]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(data[IFLA_MACVLAN_MACADDR]))) return -EADDRNOTAVAIL; } if (data[IFLA_MACVLAN_MACADDR_DATA]) { head = nla_data(data[IFLA_MACVLAN_MACADDR_DATA]); len = nla_len(data[IFLA_MACVLAN_MACADDR_DATA]); nla_for_each_attr(nla, head, len, rem) { if (nla_type(nla) != IFLA_MACVLAN_MACADDR || nla_len(nla) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(nla))) return -EADDRNOTAVAIL; } } if (data[IFLA_MACVLAN_MACADDR_COUNT]) return -EINVAL; return 0; } /* * reconfigure list of remote source mac address * (only for macvlan devices in source mode) * Note regarding alignment: all netlink data is aligned to 4 Byte, which * suffices for both ether_addr_copy and ether_addr_equal_64bits usage. 
*/ static int macvlan_changelink_sources(struct macvlan_dev *vlan, u32 mode, struct nlattr *data[]) { char *addr = NULL; int ret, rem, len; struct nlattr *nla, *head; struct macvlan_source_entry *entry; if (data[IFLA_MACVLAN_MACADDR]) addr = nla_data(data[IFLA_MACVLAN_MACADDR]); if (mode == MACVLAN_MACADDR_ADD) { if (!addr) return -EINVAL; return macvlan_hash_add_source(vlan, addr); } else if (mode == MACVLAN_MACADDR_DEL) { if (!addr) return -EINVAL; entry = macvlan_hash_lookup_source(vlan, addr); if (entry) { macvlan_hash_del_source(entry); vlan->macaddr_count--; } } else if (mode == MACVLAN_MACADDR_FLUSH) { macvlan_flush_sources(vlan->port, vlan); } else if (mode == MACVLAN_MACADDR_SET) { macvlan_flush_sources(vlan->port, vlan); if (addr) { ret = macvlan_hash_add_source(vlan, addr); if (ret) return ret; } if (!data[IFLA_MACVLAN_MACADDR_DATA]) return 0; head = nla_data(data[IFLA_MACVLAN_MACADDR_DATA]); len = nla_len(data[IFLA_MACVLAN_MACADDR_DATA]); nla_for_each_attr(nla, head, len, rem) { addr = nla_data(nla); ret = macvlan_hash_add_source(vlan, addr); if (ret) return ret; } } else { return -EINVAL; } return 0; } int macvlan_common_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_port *port; struct net_device *lowerdev; int err; int macmode; bool create = false; if (!tb[IFLA_LINK]) return -EINVAL; lowerdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); if (lowerdev == NULL) return -ENODEV; /* When creating macvlans or macvtaps on top of other macvlans - use * the real device as the lowerdev. */ if (netif_is_macvlan(lowerdev)) lowerdev = macvlan_dev_real_dev(lowerdev); if (!tb[IFLA_MTU]) dev->mtu = lowerdev->mtu; else if (dev->mtu > lowerdev->mtu) return -EINVAL; /* MTU range: 68 - lowerdev->max_mtu */ dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = lowerdev->max_mtu; if (!tb[IFLA_ADDRESS]) eth_hw_addr_random(dev); if (!netif_is_macvlan_port(lowerdev)) { err = macvlan_port_create(lowerdev); if (err < 0) return err; create = true; } port = macvlan_port_get_rtnl(lowerdev); /* Only 1 macvlan device can be created in passthru mode */ if (macvlan_passthru(port)) { /* The macvlan port must be not created this time, * still goto destroy_macvlan_port for readability. 
*/ err = -EINVAL; goto destroy_macvlan_port; } vlan->lowerdev = lowerdev; vlan->dev = dev; vlan->port = port; vlan->set_features = MACVLAN_FEATURES; vlan->mode = MACVLAN_MODE_VEPA; if (data && data[IFLA_MACVLAN_MODE]) vlan->mode = nla_get_u32(data[IFLA_MACVLAN_MODE]); if (data && data[IFLA_MACVLAN_FLAGS]) vlan->flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]); if (vlan->mode == MACVLAN_MODE_PASSTHRU) { if (port->count) { err = -EINVAL; goto destroy_macvlan_port; } macvlan_set_passthru(port); eth_hw_addr_inherit(dev, lowerdev); } if (data && data[IFLA_MACVLAN_MACADDR_MODE]) { if (vlan->mode != MACVLAN_MODE_SOURCE) { err = -EINVAL; goto destroy_macvlan_port; } macmode = nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE]); err = macvlan_changelink_sources(vlan, macmode, data); if (err) goto destroy_macvlan_port; } vlan->bc_queue_len_req = MACVLAN_DEFAULT_BC_QUEUE_LEN; if (data && data[IFLA_MACVLAN_BC_QUEUE_LEN]) vlan->bc_queue_len_req = nla_get_u32(data[IFLA_MACVLAN_BC_QUEUE_LEN]); if (data && data[IFLA_MACVLAN_BC_CUTOFF]) update_port_bc_cutoff( vlan, nla_get_s32(data[IFLA_MACVLAN_BC_CUTOFF])); err = register_netdevice(dev); if (err < 0) goto destroy_macvlan_port; dev->priv_flags |= IFF_MACVLAN; err = netdev_upper_dev_link(lowerdev, dev, extack); if (err) goto unregister_netdev; list_add_tail_rcu(&vlan->list, &port->vlans); update_port_bc_queue_len(vlan->port); netif_stacked_transfer_operstate(lowerdev, dev); linkwatch_fire_event(dev); return 0; unregister_netdev: /* macvlan_uninit would free the macvlan port */ unregister_netdevice(dev); return err; destroy_macvlan_port: /* the macvlan port may be freed by macvlan_uninit when fail to register. * so we destroy the macvlan port only when it's valid. */ if (create && macvlan_port_get_rtnl(lowerdev)) { macvlan_flush_sources(port, vlan); macvlan_port_destroy(port->dev); } return err; } EXPORT_SYMBOL_GPL(macvlan_common_newlink); static int macvlan_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { return macvlan_common_newlink(src_net, dev, tb, data, extack); } void macvlan_dellink(struct net_device *dev, struct list_head *head) { struct macvlan_dev *vlan = netdev_priv(dev); if (vlan->mode == MACVLAN_MODE_SOURCE) macvlan_flush_sources(vlan->port, vlan); list_del_rcu(&vlan->list); update_port_bc_queue_len(vlan->port); unregister_netdevice_queue(dev, head); netdev_upper_dev_unlink(vlan->lowerdev, dev); } EXPORT_SYMBOL_GPL(macvlan_dellink); static int macvlan_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); enum macvlan_mode mode; bool set_mode = false; enum macvlan_macaddr_mode macmode; int ret; /* Validate mode, but don't set yet: setting flags may fail. 
*/ if (data && data[IFLA_MACVLAN_MODE]) { set_mode = true; mode = nla_get_u32(data[IFLA_MACVLAN_MODE]); /* Passthrough mode can't be set or cleared dynamically */ if ((mode == MACVLAN_MODE_PASSTHRU) != (vlan->mode == MACVLAN_MODE_PASSTHRU)) return -EINVAL; if (vlan->mode == MACVLAN_MODE_SOURCE && vlan->mode != mode) macvlan_flush_sources(vlan->port, vlan); } if (data && data[IFLA_MACVLAN_FLAGS]) { __u16 flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]); bool promisc = (flags ^ vlan->flags) & MACVLAN_FLAG_NOPROMISC; if (macvlan_passthru(vlan->port) && promisc) { int err; if (flags & MACVLAN_FLAG_NOPROMISC) err = dev_set_promiscuity(vlan->lowerdev, -1); else err = dev_set_promiscuity(vlan->lowerdev, 1); if (err < 0) return err; } vlan->flags = flags; } if (data && data[IFLA_MACVLAN_BC_QUEUE_LEN]) { vlan->bc_queue_len_req = nla_get_u32(data[IFLA_MACVLAN_BC_QUEUE_LEN]); update_port_bc_queue_len(vlan->port); } if (data && data[IFLA_MACVLAN_BC_CUTOFF]) update_port_bc_cutoff( vlan, nla_get_s32(data[IFLA_MACVLAN_BC_CUTOFF])); if (set_mode) vlan->mode = mode; if (data && data[IFLA_MACVLAN_MACADDR_MODE]) { if (vlan->mode != MACVLAN_MODE_SOURCE) return -EINVAL; macmode = nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE]); ret = macvlan_changelink_sources(vlan, macmode, data); if (ret) return ret; } return 0; } static size_t macvlan_get_size_mac(const struct macvlan_dev *vlan) { if (vlan->macaddr_count == 0) return 0; return nla_total_size(0) /* IFLA_MACVLAN_MACADDR_DATA */ + vlan->macaddr_count * nla_total_size(sizeof(u8) * ETH_ALEN); } static size_t macvlan_get_size(const struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); return (0 + nla_total_size(4) /* IFLA_MACVLAN_MODE */ + nla_total_size(2) /* IFLA_MACVLAN_FLAGS */ + nla_total_size(4) /* IFLA_MACVLAN_MACADDR_COUNT */ + macvlan_get_size_mac(vlan) /* IFLA_MACVLAN_MACADDR */ + nla_total_size(4) /* IFLA_MACVLAN_BC_QUEUE_LEN */ + nla_total_size(4) /* IFLA_MACVLAN_BC_QUEUE_LEN_USED */ ); } static int macvlan_fill_info_macaddr(struct sk_buff *skb, const struct macvlan_dev *vlan, const int i) { struct hlist_head *h = &vlan->port->vlan_source_hash[i]; struct macvlan_source_entry *entry; hlist_for_each_entry_rcu(entry, h, hlist, lockdep_rtnl_is_held()) { if (entry->vlan != vlan) continue; if (nla_put(skb, IFLA_MACVLAN_MACADDR, ETH_ALEN, entry->addr)) return 1; } return 0; } static int macvlan_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_port *port = vlan->port; int i; struct nlattr *nest; if (nla_put_u32(skb, IFLA_MACVLAN_MODE, vlan->mode)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_MACVLAN_FLAGS, vlan->flags)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_MACVLAN_MACADDR_COUNT, vlan->macaddr_count)) goto nla_put_failure; if (vlan->macaddr_count > 0) { nest = nla_nest_start_noflag(skb, IFLA_MACVLAN_MACADDR_DATA); if (nest == NULL) goto nla_put_failure; for (i = 0; i < MACVLAN_HASH_SIZE; i++) { if (macvlan_fill_info_macaddr(skb, vlan, i)) goto nla_put_failure; } nla_nest_end(skb, nest); } if (nla_put_u32(skb, IFLA_MACVLAN_BC_QUEUE_LEN, vlan->bc_queue_len_req)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_MACVLAN_BC_QUEUE_LEN_USED, port->bc_queue_len_used)) goto nla_put_failure; if (port->bc_cutoff != 1 && nla_put_s32(skb, IFLA_MACVLAN_BC_CUTOFF, port->bc_cutoff)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static const struct nla_policy macvlan_policy[IFLA_MACVLAN_MAX + 1] = { [IFLA_MACVLAN_MODE] = { .type = NLA_U32 }, 
[IFLA_MACVLAN_FLAGS] = { .type = NLA_U16 }, [IFLA_MACVLAN_MACADDR_MODE] = { .type = NLA_U32 }, [IFLA_MACVLAN_MACADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_MACVLAN_MACADDR_DATA] = { .type = NLA_NESTED }, [IFLA_MACVLAN_MACADDR_COUNT] = { .type = NLA_U32 }, [IFLA_MACVLAN_BC_QUEUE_LEN] = { .type = NLA_U32 }, [IFLA_MACVLAN_BC_QUEUE_LEN_USED] = { .type = NLA_REJECT }, [IFLA_MACVLAN_BC_CUTOFF] = { .type = NLA_S32 }, }; int macvlan_link_register(struct rtnl_link_ops *ops) { /* common fields */ ops->validate = macvlan_validate; ops->maxtype = IFLA_MACVLAN_MAX; ops->policy = macvlan_policy; ops->changelink = macvlan_changelink; ops->get_size = macvlan_get_size; ops->fill_info = macvlan_fill_info; return rtnl_link_register(ops); }; EXPORT_SYMBOL_GPL(macvlan_link_register); static struct net *macvlan_get_link_net(const struct net_device *dev) { return dev_net(macvlan_dev_real_dev(dev)); } static struct rtnl_link_ops macvlan_link_ops = { .kind = "macvlan", .setup = macvlan_setup, .newlink = macvlan_newlink, .dellink = macvlan_dellink, .get_link_net = macvlan_get_link_net, .priv_size = sizeof(struct macvlan_dev), }; static void update_port_bc_queue_len(struct macvlan_port *port) { u32 max_bc_queue_len_req = 0; struct macvlan_dev *vlan; list_for_each_entry(vlan, &port->vlans, list) { if (vlan->bc_queue_len_req > max_bc_queue_len_req) max_bc_queue_len_req = vlan->bc_queue_len_req; } port->bc_queue_len_used = max_bc_queue_len_req; } static int macvlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct macvlan_dev *vlan, *next; struct macvlan_port *port; LIST_HEAD(list_kill); if (!netif_is_macvlan_port(dev)) return NOTIFY_DONE; port = macvlan_port_get_rtnl(dev); switch (event) { case NETDEV_UP: case NETDEV_DOWN: case NETDEV_CHANGE: list_for_each_entry(vlan, &port->vlans, list) netif_stacked_transfer_operstate(vlan->lowerdev, vlan->dev); break; case NETDEV_FEAT_CHANGE: list_for_each_entry(vlan, &port->vlans, list) { netif_inherit_tso_max(vlan->dev, dev); netdev_update_features(vlan->dev); } break; case NETDEV_CHANGEMTU: list_for_each_entry(vlan, &port->vlans, list) { if (vlan->dev->mtu <= dev->mtu) continue; dev_set_mtu(vlan->dev, dev->mtu); } break; case NETDEV_CHANGEADDR: if (!macvlan_passthru(port)) return NOTIFY_DONE; vlan = list_first_entry_or_null(&port->vlans, struct macvlan_dev, list); if (vlan && macvlan_sync_address(vlan->dev, dev->dev_addr)) return NOTIFY_BAD; break; case NETDEV_UNREGISTER: /* twiddle thumbs on netns device moves */ if (dev->reg_state != NETREG_UNREGISTERING) break; list_for_each_entry_safe(vlan, next, &port->vlans, list) vlan->dev->rtnl_link_ops->dellink(vlan->dev, &list_kill); unregister_netdevice_many(&list_kill); break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlying device to change its type. 
*/ return NOTIFY_BAD; case NETDEV_NOTIFY_PEERS: case NETDEV_BONDING_FAILOVER: case NETDEV_RESEND_IGMP: /* Propagate to all vlans */ list_for_each_entry(vlan, &port->vlans, list) call_netdevice_notifiers(event, vlan->dev); } return NOTIFY_DONE; } static struct notifier_block macvlan_notifier_block __read_mostly = { .notifier_call = macvlan_device_event, }; static int __init macvlan_init_module(void) { int err; register_netdevice_notifier(&macvlan_notifier_block); err = macvlan_link_register(&macvlan_link_ops); if (err < 0) goto err1; return 0; err1: unregister_netdevice_notifier(&macvlan_notifier_block); return err; } static void __exit macvlan_cleanup_module(void) { rtnl_link_unregister(&macvlan_link_ops); unregister_netdevice_notifier(&macvlan_notifier_block); } module_init(macvlan_init_module); module_exit(macvlan_cleanup_module); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Driver for MAC address based VLANs"); MODULE_ALIAS_RTNL_LINK("macvlan");
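The macvlan core above deliberately exports its reuse surface (macvlan_common_setup, macvlan_common_newlink, macvlan_dellink, macvlan_link_register) so that stacked drivers can layer their own rtnl_link_ops on top of it, as macvtap does in-tree. The sketch below is illustrative only: the "macvlan2" kind, the wrapper function names, and the module boilerplate are hypothetical, but the exported helpers and their signatures are the ones shown in the listing above.

/* Hypothetical sketch: a stacked driver reusing the exported macvlan
 * helpers. See drivers/net/macvtap.c for the real in-tree user.
 */
#include <linux/module.h>
#include <linux/if_macvlan.h>
#include <net/rtnetlink.h>

static void macvlan2_setup(struct net_device *dev)
{
	/* Inherit the common macvlan netdev setup, then tweak as needed. */
	macvlan_common_setup(dev);
	dev->priv_flags |= IFF_NO_QUEUE;
}

static int macvlan2_newlink(struct net *src_net, struct net_device *dev,
			    struct nlattr *tb[], struct nlattr *data[],
			    struct netlink_ext_ack *extack)
{
	/* Delegate lowerdev lookup, port creation and registration. */
	return macvlan_common_newlink(src_net, dev, tb, data, extack);
}

static struct rtnl_link_ops macvlan2_link_ops = {
	.kind		= "macvlan2",
	.setup		= macvlan2_setup,
	.newlink	= macvlan2_newlink,
	.dellink	= macvlan_dellink,
	.priv_size	= sizeof(struct macvlan_dev),
};

static int __init macvlan2_init(void)
{
	/* macvlan_link_register() fills in validate/policy/changelink/
	 * get_size/fill_info before calling rtnl_link_register().
	 */
	return macvlan_link_register(&macvlan2_link_ops);
}
module_init(macvlan2_init);

static void __exit macvlan2_exit(void)
{
	rtnl_link_unregister(&macvlan2_link_ops);
}
module_exit(macvlan2_exit);

MODULE_LICENSE("GPL");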
906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 // SPDX-License-Identifier: GPL-2.0-only /* L2TP netlink layer, for management * * Copyright (c) 2008,2009,2010 Katalix Systems Ltd * * Partly based on the IrDA nelink implementation * (see net/irda/irnetlink.c) which is: * Copyright (c) 2007 Samuel Ortiz <samuel@sortiz.org> * which is in turn partly based on the wireless netlink code: * Copyright 2006 Johannes Berg <johannes@sipsolutions.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <net/sock.h> #include <net/genetlink.h> #include <net/udp.h> #include <linux/in.h> #include <linux/udp.h> #include <linux/socket.h> #include <linux/module.h> #include <linux/list.h> #include <net/net_namespace.h> #include <linux/l2tp.h> #include "l2tp_core.h" static struct genl_family l2tp_nl_family; static const struct genl_multicast_group l2tp_multicast_group[] = { { .name = L2TP_GENL_MCGROUP, }, }; static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int flags, struct l2tp_tunnel *tunnel, u8 cmd); static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int flags, struct l2tp_session *session, u8 cmd); /* Accessed under genl lock */ static const struct l2tp_nl_cmd_ops *l2tp_nl_cmd_ops[__L2TP_PWTYPE_MAX]; static struct l2tp_session *l2tp_nl_session_get(struct genl_info *info) { u32 tunnel_id; u32 session_id; char *ifname; struct l2tp_tunnel *tunnel; struct l2tp_session *session = NULL; struct net *net = genl_info_net(info); if (info->attrs[L2TP_ATTR_IFNAME]) { ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]); session = l2tp_session_get_by_ifname(net, ifname); } else if ((info->attrs[L2TP_ATTR_SESSION_ID]) && (info->attrs[L2TP_ATTR_CONN_ID])) { tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]); tunnel = l2tp_tunnel_get(net, tunnel_id); if (tunnel) { session = l2tp_session_get(net, tunnel->sock, tunnel->version, tunnel_id, session_id); l2tp_tunnel_put(tunnel); } } return session; } static int l2tp_nl_cmd_noop(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; void *hdr; int ret = -ENOBUFS; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto out; } hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &l2tp_nl_family, 0, L2TP_CMD_NOOP); if (!hdr) { ret = -EMSGSIZE; goto err_out; } genlmsg_end(msg, hdr); return genlmsg_unicast(genl_info_net(info), msg, info->snd_portid); err_out: nlmsg_free(msg); out: return ret; } static int l2tp_tunnel_notify(struct genl_family *family, struct genl_info *info, struct l2tp_tunnel *tunnel, u8 cmd) { struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = l2tp_nl_tunnel_send(msg, info->snd_portid, info->snd_seq, NLM_F_ACK, tunnel, cmd); if (ret >= 0) { ret = genlmsg_multicast_allns(family, msg, 0, 0); /* We don't care if no one is listening */ 
if (ret == -ESRCH) ret = 0; return ret; } nlmsg_free(msg); return ret; } static int l2tp_session_notify(struct genl_family *family, struct genl_info *info, struct l2tp_session *session, u8 cmd) { struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = l2tp_nl_session_send(msg, info->snd_portid, info->snd_seq, NLM_F_ACK, session, cmd); if (ret >= 0) { ret = genlmsg_multicast_allns(family, msg, 0, 0); /* We don't care if no one is listening */ if (ret == -ESRCH) ret = 0; return ret; } nlmsg_free(msg); return ret; } static int l2tp_nl_cmd_tunnel_create_get_addr(struct nlattr **attrs, struct l2tp_tunnel_cfg *cfg) { if (attrs[L2TP_ATTR_UDP_SPORT]) cfg->local_udp_port = nla_get_u16(attrs[L2TP_ATTR_UDP_SPORT]); if (attrs[L2TP_ATTR_UDP_DPORT]) cfg->peer_udp_port = nla_get_u16(attrs[L2TP_ATTR_UDP_DPORT]); cfg->use_udp_checksums = nla_get_flag(attrs[L2TP_ATTR_UDP_CSUM]); /* Must have either AF_INET or AF_INET6 address for source and destination */ #if IS_ENABLED(CONFIG_IPV6) if (attrs[L2TP_ATTR_IP6_SADDR] && attrs[L2TP_ATTR_IP6_DADDR]) { cfg->local_ip6 = nla_data(attrs[L2TP_ATTR_IP6_SADDR]); cfg->peer_ip6 = nla_data(attrs[L2TP_ATTR_IP6_DADDR]); cfg->udp6_zero_tx_checksums = nla_get_flag(attrs[L2TP_ATTR_UDP_ZERO_CSUM6_TX]); cfg->udp6_zero_rx_checksums = nla_get_flag(attrs[L2TP_ATTR_UDP_ZERO_CSUM6_RX]); return 0; } #endif if (attrs[L2TP_ATTR_IP_SADDR] && attrs[L2TP_ATTR_IP_DADDR]) { cfg->local_ip.s_addr = nla_get_in_addr(attrs[L2TP_ATTR_IP_SADDR]); cfg->peer_ip.s_addr = nla_get_in_addr(attrs[L2TP_ATTR_IP_DADDR]); return 0; } return -EINVAL; } static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info) { u32 tunnel_id; u32 peer_tunnel_id; int proto_version; int fd = -1; int ret = 0; struct l2tp_tunnel_cfg cfg = { 0, }; struct l2tp_tunnel *tunnel; struct net *net = genl_info_net(info); struct nlattr **attrs = info->attrs; if (!attrs[L2TP_ATTR_CONN_ID]) { ret = -EINVAL; goto out; } tunnel_id = nla_get_u32(attrs[L2TP_ATTR_CONN_ID]); if (!attrs[L2TP_ATTR_PEER_CONN_ID]) { ret = -EINVAL; goto out; } peer_tunnel_id = nla_get_u32(attrs[L2TP_ATTR_PEER_CONN_ID]); if (!attrs[L2TP_ATTR_PROTO_VERSION]) { ret = -EINVAL; goto out; } proto_version = nla_get_u8(attrs[L2TP_ATTR_PROTO_VERSION]); if (!attrs[L2TP_ATTR_ENCAP_TYPE]) { ret = -EINVAL; goto out; } cfg.encap = nla_get_u16(attrs[L2TP_ATTR_ENCAP_TYPE]); /* Managed tunnels take the tunnel socket from userspace. * Unmanaged tunnels must call out the source and destination addresses * for the kernel to create the tunnel socket itself. 
*/ if (attrs[L2TP_ATTR_FD]) { fd = nla_get_u32(attrs[L2TP_ATTR_FD]); } else { ret = l2tp_nl_cmd_tunnel_create_get_addr(attrs, &cfg); if (ret < 0) goto out; } ret = -EINVAL; switch (cfg.encap) { case L2TP_ENCAPTYPE_UDP: case L2TP_ENCAPTYPE_IP: ret = l2tp_tunnel_create(fd, proto_version, tunnel_id, peer_tunnel_id, &cfg, &tunnel); break; } if (ret < 0) goto out; refcount_inc(&tunnel->ref_count); ret = l2tp_tunnel_register(tunnel, net, &cfg); if (ret < 0) { kfree(tunnel); goto out; } ret = l2tp_tunnel_notify(&l2tp_nl_family, info, tunnel, L2TP_CMD_TUNNEL_CREATE); l2tp_tunnel_put(tunnel); out: return ret; } static int l2tp_nl_cmd_tunnel_delete(struct sk_buff *skb, struct genl_info *info) { struct l2tp_tunnel *tunnel; u32 tunnel_id; int ret = 0; struct net *net = genl_info_net(info); if (!info->attrs[L2TP_ATTR_CONN_ID]) { ret = -EINVAL; goto out; } tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); tunnel = l2tp_tunnel_get(net, tunnel_id); if (!tunnel) { ret = -ENODEV; goto out; } l2tp_tunnel_notify(&l2tp_nl_family, info, tunnel, L2TP_CMD_TUNNEL_DELETE); l2tp_tunnel_delete(tunnel); l2tp_tunnel_put(tunnel); out: return ret; } static int l2tp_nl_cmd_tunnel_modify(struct sk_buff *skb, struct genl_info *info) { struct l2tp_tunnel *tunnel; u32 tunnel_id; int ret = 0; struct net *net = genl_info_net(info); if (!info->attrs[L2TP_ATTR_CONN_ID]) { ret = -EINVAL; goto out; } tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); tunnel = l2tp_tunnel_get(net, tunnel_id); if (!tunnel) { ret = -ENODEV; goto out; } ret = l2tp_tunnel_notify(&l2tp_nl_family, info, tunnel, L2TP_CMD_TUNNEL_MODIFY); l2tp_tunnel_put(tunnel); out: return ret; } #if IS_ENABLED(CONFIG_IPV6) static int l2tp_nl_tunnel_send_addr6(struct sk_buff *skb, struct sock *sk, enum l2tp_encap_type encap) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); switch (encap) { case L2TP_ENCAPTYPE_UDP: if (udp_get_no_check6_tx(sk) && nla_put_flag(skb, L2TP_ATTR_UDP_ZERO_CSUM6_TX)) return -1; if (udp_get_no_check6_rx(sk) && nla_put_flag(skb, L2TP_ATTR_UDP_ZERO_CSUM6_RX)) return -1; if (nla_put_u16(skb, L2TP_ATTR_UDP_SPORT, ntohs(inet->inet_sport)) || nla_put_u16(skb, L2TP_ATTR_UDP_DPORT, ntohs(inet->inet_dport))) return -1; fallthrough; case L2TP_ENCAPTYPE_IP: if (nla_put_in6_addr(skb, L2TP_ATTR_IP6_SADDR, &np->saddr) || nla_put_in6_addr(skb, L2TP_ATTR_IP6_DADDR, &sk->sk_v6_daddr)) return -1; break; } return 0; } #endif static int l2tp_nl_tunnel_send_addr4(struct sk_buff *skb, struct sock *sk, enum l2tp_encap_type encap) { struct inet_sock *inet = inet_sk(sk); switch (encap) { case L2TP_ENCAPTYPE_UDP: if (nla_put_u8(skb, L2TP_ATTR_UDP_CSUM, !sk->sk_no_check_tx) || nla_put_u16(skb, L2TP_ATTR_UDP_SPORT, ntohs(inet->inet_sport)) || nla_put_u16(skb, L2TP_ATTR_UDP_DPORT, ntohs(inet->inet_dport))) return -1; fallthrough; case L2TP_ENCAPTYPE_IP: if (nla_put_in_addr(skb, L2TP_ATTR_IP_SADDR, inet->inet_saddr) || nla_put_in_addr(skb, L2TP_ATTR_IP_DADDR, inet->inet_daddr)) return -1; break; } return 0; } /* Append attributes for the tunnel address, handling the different attribute types * used for different tunnel encapsulation and AF_INET v.s. AF_INET6. 
*/ static int l2tp_nl_tunnel_send_addr(struct sk_buff *skb, struct l2tp_tunnel *tunnel) { struct sock *sk = tunnel->sock; if (!sk) return 0; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) return l2tp_nl_tunnel_send_addr6(skb, sk, tunnel->encap); #endif return l2tp_nl_tunnel_send_addr4(skb, sk, tunnel->encap); } static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int flags, struct l2tp_tunnel *tunnel, u8 cmd) { void *hdr; struct nlattr *nest; hdr = genlmsg_put(skb, portid, seq, &l2tp_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (nla_put_u8(skb, L2TP_ATTR_PROTO_VERSION, tunnel->version) || nla_put_u32(skb, L2TP_ATTR_CONN_ID, tunnel->tunnel_id) || nla_put_u32(skb, L2TP_ATTR_PEER_CONN_ID, tunnel->peer_tunnel_id) || nla_put_u32(skb, L2TP_ATTR_DEBUG, 0) || nla_put_u16(skb, L2TP_ATTR_ENCAP_TYPE, tunnel->encap)) goto nla_put_failure; nest = nla_nest_start_noflag(skb, L2TP_ATTR_STATS); if (!nest) goto nla_put_failure; if (nla_put_u64_64bit(skb, L2TP_ATTR_TX_PACKETS, atomic_long_read(&tunnel->stats.tx_packets), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_TX_BYTES, atomic_long_read(&tunnel->stats.tx_bytes), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_TX_ERRORS, atomic_long_read(&tunnel->stats.tx_errors), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_PACKETS, atomic_long_read(&tunnel->stats.rx_packets), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_BYTES, atomic_long_read(&tunnel->stats.rx_bytes), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_SEQ_DISCARDS, atomic_long_read(&tunnel->stats.rx_seq_discards), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_COOKIE_DISCARDS, atomic_long_read(&tunnel->stats.rx_cookie_discards), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_OOS_PACKETS, atomic_long_read(&tunnel->stats.rx_oos_packets), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_ERRORS, atomic_long_read(&tunnel->stats.rx_errors), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_INVALID, atomic_long_read(&tunnel->stats.rx_invalid), L2TP_ATTR_STATS_PAD)) goto nla_put_failure; nla_nest_end(skb, nest); if (l2tp_nl_tunnel_send_addr(skb, tunnel)) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -1; } static int l2tp_nl_cmd_tunnel_get(struct sk_buff *skb, struct genl_info *info) { struct l2tp_tunnel *tunnel; struct sk_buff *msg; u32 tunnel_id; int ret = -ENOBUFS; struct net *net = genl_info_net(info); if (!info->attrs[L2TP_ATTR_CONN_ID]) { ret = -EINVAL; goto err; } tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto err; } tunnel = l2tp_tunnel_get(net, tunnel_id); if (!tunnel) { ret = -ENODEV; goto err_nlmsg; } ret = l2tp_nl_tunnel_send(msg, info->snd_portid, info->snd_seq, NLM_F_ACK, tunnel, L2TP_CMD_TUNNEL_GET); if (ret < 0) goto err_nlmsg_tunnel; l2tp_tunnel_put(tunnel); return genlmsg_unicast(net, msg, info->snd_portid); err_nlmsg_tunnel: l2tp_tunnel_put(tunnel); err_nlmsg: nlmsg_free(msg); err: return ret; } struct l2tp_nl_cb_data { unsigned long tkey; unsigned long skey; }; static int l2tp_nl_cmd_tunnel_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct l2tp_nl_cb_data *cbd = (void *)&cb->ctx[0]; unsigned long key = cbd->tkey; struct l2tp_tunnel *tunnel; struct net *net = sock_net(skb->sk); for (;;) { tunnel = l2tp_tunnel_get_next(net, &key); if (!tunnel) goto out; if (l2tp_nl_tunnel_send(skb, 
NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, tunnel, L2TP_CMD_TUNNEL_GET) < 0) { l2tp_tunnel_put(tunnel); goto out; } l2tp_tunnel_put(tunnel); key++; } out: cbd->tkey = key; return skb->len; } static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *info) { u32 tunnel_id = 0; u32 session_id; u32 peer_session_id; int ret = 0; struct l2tp_tunnel *tunnel; struct l2tp_session *session; struct l2tp_session_cfg cfg = { 0, }; struct net *net = genl_info_net(info); if (!info->attrs[L2TP_ATTR_CONN_ID]) { ret = -EINVAL; goto out; } tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); tunnel = l2tp_tunnel_get(net, tunnel_id); if (!tunnel) { ret = -ENODEV; goto out; } if (!info->attrs[L2TP_ATTR_SESSION_ID]) { ret = -EINVAL; goto out_tunnel; } session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]); if (!info->attrs[L2TP_ATTR_PEER_SESSION_ID]) { ret = -EINVAL; goto out_tunnel; } peer_session_id = nla_get_u32(info->attrs[L2TP_ATTR_PEER_SESSION_ID]); if (!info->attrs[L2TP_ATTR_PW_TYPE]) { ret = -EINVAL; goto out_tunnel; } cfg.pw_type = nla_get_u16(info->attrs[L2TP_ATTR_PW_TYPE]); if (cfg.pw_type >= __L2TP_PWTYPE_MAX) { ret = -EINVAL; goto out_tunnel; } /* L2TPv2 only accepts PPP pseudo-wires */ if (tunnel->version == 2 && cfg.pw_type != L2TP_PWTYPE_PPP) { ret = -EPROTONOSUPPORT; goto out_tunnel; } if (tunnel->version > 2) { if (info->attrs[L2TP_ATTR_L2SPEC_TYPE]) { cfg.l2specific_type = nla_get_u8(info->attrs[L2TP_ATTR_L2SPEC_TYPE]); if (cfg.l2specific_type != L2TP_L2SPECTYPE_DEFAULT && cfg.l2specific_type != L2TP_L2SPECTYPE_NONE) { ret = -EINVAL; goto out_tunnel; } } else { cfg.l2specific_type = L2TP_L2SPECTYPE_DEFAULT; } if (info->attrs[L2TP_ATTR_COOKIE]) { u16 len = nla_len(info->attrs[L2TP_ATTR_COOKIE]); if (len > 8) { ret = -EINVAL; goto out_tunnel; } cfg.cookie_len = len; memcpy(&cfg.cookie[0], nla_data(info->attrs[L2TP_ATTR_COOKIE]), len); } if (info->attrs[L2TP_ATTR_PEER_COOKIE]) { u16 len = nla_len(info->attrs[L2TP_ATTR_PEER_COOKIE]); if (len > 8) { ret = -EINVAL; goto out_tunnel; } cfg.peer_cookie_len = len; memcpy(&cfg.peer_cookie[0], nla_data(info->attrs[L2TP_ATTR_PEER_COOKIE]), len); } if (info->attrs[L2TP_ATTR_IFNAME]) cfg.ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]); } if (info->attrs[L2TP_ATTR_RECV_SEQ]) cfg.recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]); if (info->attrs[L2TP_ATTR_SEND_SEQ]) cfg.send_seq = nla_get_u8(info->attrs[L2TP_ATTR_SEND_SEQ]); if (info->attrs[L2TP_ATTR_LNS_MODE]) cfg.lns_mode = nla_get_u8(info->attrs[L2TP_ATTR_LNS_MODE]); if (info->attrs[L2TP_ATTR_RECV_TIMEOUT]) cfg.reorder_timeout = nla_get_msecs(info->attrs[L2TP_ATTR_RECV_TIMEOUT]); #ifdef CONFIG_MODULES if (!l2tp_nl_cmd_ops[cfg.pw_type]) { genl_unlock(); request_module("net-l2tp-type-%u", cfg.pw_type); genl_lock(); } #endif if (!l2tp_nl_cmd_ops[cfg.pw_type] || !l2tp_nl_cmd_ops[cfg.pw_type]->session_create) { ret = -EPROTONOSUPPORT; goto out_tunnel; } ret = l2tp_nl_cmd_ops[cfg.pw_type]->session_create(net, tunnel, session_id, peer_session_id, &cfg); if (ret >= 0) { session = l2tp_session_get(net, tunnel->sock, tunnel->version, tunnel_id, session_id); if (session) { ret = l2tp_session_notify(&l2tp_nl_family, info, session, L2TP_CMD_SESSION_CREATE); l2tp_session_put(session); } } out_tunnel: l2tp_tunnel_put(tunnel); out: return ret; } static int l2tp_nl_cmd_session_delete(struct sk_buff *skb, struct genl_info *info) { int ret = 0; struct l2tp_session *session; u16 pw_type; session = l2tp_nl_session_get(info); if (!session) { ret = -ENODEV; goto out; } 
l2tp_session_notify(&l2tp_nl_family, info, session, L2TP_CMD_SESSION_DELETE); pw_type = session->pwtype; if (pw_type < __L2TP_PWTYPE_MAX) if (l2tp_nl_cmd_ops[pw_type] && l2tp_nl_cmd_ops[pw_type]->session_delete) l2tp_nl_cmd_ops[pw_type]->session_delete(session); l2tp_session_put(session); out: return ret; } static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *info) { int ret = 0; struct l2tp_session *session; session = l2tp_nl_session_get(info); if (!session) { ret = -ENODEV; goto out; } if (info->attrs[L2TP_ATTR_RECV_SEQ]) session->recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]); if (info->attrs[L2TP_ATTR_SEND_SEQ]) { struct l2tp_tunnel *tunnel = session->tunnel; session->send_seq = nla_get_u8(info->attrs[L2TP_ATTR_SEND_SEQ]); l2tp_session_set_header_len(session, tunnel->version, tunnel->encap); } if (info->attrs[L2TP_ATTR_LNS_MODE]) session->lns_mode = nla_get_u8(info->attrs[L2TP_ATTR_LNS_MODE]); if (info->attrs[L2TP_ATTR_RECV_TIMEOUT]) session->reorder_timeout = nla_get_msecs(info->attrs[L2TP_ATTR_RECV_TIMEOUT]); ret = l2tp_session_notify(&l2tp_nl_family, info, session, L2TP_CMD_SESSION_MODIFY); l2tp_session_put(session); out: return ret; } static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int flags, struct l2tp_session *session, u8 cmd) { void *hdr; struct nlattr *nest; struct l2tp_tunnel *tunnel = session->tunnel; hdr = genlmsg_put(skb, portid, seq, &l2tp_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (nla_put_u32(skb, L2TP_ATTR_CONN_ID, tunnel->tunnel_id) || nla_put_u32(skb, L2TP_ATTR_SESSION_ID, session->session_id) || nla_put_u32(skb, L2TP_ATTR_PEER_CONN_ID, tunnel->peer_tunnel_id) || nla_put_u32(skb, L2TP_ATTR_PEER_SESSION_ID, session->peer_session_id) || nla_put_u32(skb, L2TP_ATTR_DEBUG, 0) || nla_put_u16(skb, L2TP_ATTR_PW_TYPE, session->pwtype)) goto nla_put_failure; if ((session->ifname[0] && nla_put_string(skb, L2TP_ATTR_IFNAME, session->ifname)) || (session->cookie_len && nla_put(skb, L2TP_ATTR_COOKIE, session->cookie_len, session->cookie)) || (session->peer_cookie_len && nla_put(skb, L2TP_ATTR_PEER_COOKIE, session->peer_cookie_len, session->peer_cookie)) || nla_put_u8(skb, L2TP_ATTR_RECV_SEQ, session->recv_seq) || nla_put_u8(skb, L2TP_ATTR_SEND_SEQ, session->send_seq) || nla_put_u8(skb, L2TP_ATTR_LNS_MODE, session->lns_mode) || (l2tp_tunnel_uses_xfrm(tunnel) && nla_put_u8(skb, L2TP_ATTR_USING_IPSEC, 1)) || (session->reorder_timeout && nla_put_msecs(skb, L2TP_ATTR_RECV_TIMEOUT, session->reorder_timeout, L2TP_ATTR_PAD))) goto nla_put_failure; nest = nla_nest_start_noflag(skb, L2TP_ATTR_STATS); if (!nest) goto nla_put_failure; if (nla_put_u64_64bit(skb, L2TP_ATTR_TX_PACKETS, atomic_long_read(&session->stats.tx_packets), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_TX_BYTES, atomic_long_read(&session->stats.tx_bytes), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_TX_ERRORS, atomic_long_read(&session->stats.tx_errors), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_PACKETS, atomic_long_read(&session->stats.rx_packets), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_BYTES, atomic_long_read(&session->stats.rx_bytes), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_SEQ_DISCARDS, atomic_long_read(&session->stats.rx_seq_discards), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_COOKIE_DISCARDS, atomic_long_read(&session->stats.rx_cookie_discards), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_OOS_PACKETS, 
atomic_long_read(&session->stats.rx_oos_packets), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_ERRORS, atomic_long_read(&session->stats.rx_errors), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_INVALID, atomic_long_read(&session->stats.rx_invalid), L2TP_ATTR_STATS_PAD)) goto nla_put_failure; nla_nest_end(skb, nest); genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -1; } static int l2tp_nl_cmd_session_get(struct sk_buff *skb, struct genl_info *info) { struct l2tp_session *session; struct sk_buff *msg; int ret; session = l2tp_nl_session_get(info); if (!session) { ret = -ENODEV; goto err; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto err_ref; } ret = l2tp_nl_session_send(msg, info->snd_portid, info->snd_seq, 0, session, L2TP_CMD_SESSION_GET); if (ret < 0) goto err_ref_msg; ret = genlmsg_unicast(genl_info_net(info), msg, info->snd_portid); l2tp_session_put(session); return ret; err_ref_msg: nlmsg_free(msg); err_ref: l2tp_session_put(session); err: return ret; } static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct l2tp_nl_cb_data *cbd = (void *)&cb->ctx[0]; struct net *net = sock_net(skb->sk); struct l2tp_session *session; struct l2tp_tunnel *tunnel = NULL; unsigned long tkey = cbd->tkey; unsigned long skey = cbd->skey; for (;;) { if (!tunnel) { tunnel = l2tp_tunnel_get_next(net, &tkey); if (!tunnel) goto out; } session = l2tp_session_get_next(net, tunnel->sock, tunnel->version, tunnel->tunnel_id, &skey); if (!session) { tkey++; l2tp_tunnel_put(tunnel); tunnel = NULL; skey = 0; continue; } if (l2tp_nl_session_send(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, session, L2TP_CMD_SESSION_GET) < 0) { l2tp_session_put(session); l2tp_tunnel_put(tunnel); break; } l2tp_session_put(session); skey++; } out: cbd->tkey = tkey; cbd->skey = skey; return skb->len; } static const struct nla_policy l2tp_nl_policy[L2TP_ATTR_MAX + 1] = { [L2TP_ATTR_NONE] = { .type = NLA_UNSPEC, }, [L2TP_ATTR_PW_TYPE] = { .type = NLA_U16, }, [L2TP_ATTR_ENCAP_TYPE] = { .type = NLA_U16, }, [L2TP_ATTR_OFFSET] = { .type = NLA_U16, }, [L2TP_ATTR_DATA_SEQ] = { .type = NLA_U8, }, [L2TP_ATTR_L2SPEC_TYPE] = { .type = NLA_U8, }, [L2TP_ATTR_L2SPEC_LEN] = { .type = NLA_U8, }, [L2TP_ATTR_PROTO_VERSION] = { .type = NLA_U8, }, [L2TP_ATTR_CONN_ID] = { .type = NLA_U32, }, [L2TP_ATTR_PEER_CONN_ID] = { .type = NLA_U32, }, [L2TP_ATTR_SESSION_ID] = { .type = NLA_U32, }, [L2TP_ATTR_PEER_SESSION_ID] = { .type = NLA_U32, }, [L2TP_ATTR_UDP_CSUM] = { .type = NLA_U8, }, [L2TP_ATTR_VLAN_ID] = { .type = NLA_U16, }, [L2TP_ATTR_DEBUG] = { .type = NLA_U32, }, [L2TP_ATTR_RECV_SEQ] = { .type = NLA_U8, }, [L2TP_ATTR_SEND_SEQ] = { .type = NLA_U8, }, [L2TP_ATTR_LNS_MODE] = { .type = NLA_U8, }, [L2TP_ATTR_USING_IPSEC] = { .type = NLA_U8, }, [L2TP_ATTR_RECV_TIMEOUT] = { .type = NLA_MSECS, }, [L2TP_ATTR_FD] = { .type = NLA_U32, }, [L2TP_ATTR_IP_SADDR] = { .type = NLA_U32, }, [L2TP_ATTR_IP_DADDR] = { .type = NLA_U32, }, [L2TP_ATTR_UDP_SPORT] = { .type = NLA_U16, }, [L2TP_ATTR_UDP_DPORT] = { .type = NLA_U16, }, [L2TP_ATTR_MTU] = { .type = NLA_U16, }, [L2TP_ATTR_MRU] = { .type = NLA_U16, }, [L2TP_ATTR_STATS] = { .type = NLA_NESTED, }, [L2TP_ATTR_IP6_SADDR] = { .type = NLA_BINARY, .len = sizeof(struct in6_addr), }, [L2TP_ATTR_IP6_DADDR] = { .type = NLA_BINARY, .len = sizeof(struct in6_addr), }, [L2TP_ATTR_IFNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1, }, [L2TP_ATTR_COOKIE] = { .type = NLA_BINARY, .len = 
8, }, [L2TP_ATTR_PEER_COOKIE] = { .type = NLA_BINARY, .len = 8, }, }; static const struct genl_small_ops l2tp_nl_ops[] = { { .cmd = L2TP_CMD_NOOP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_noop, /* can be retrieved by unprivileged users */ }, { .cmd = L2TP_CMD_TUNNEL_CREATE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_tunnel_create, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_TUNNEL_DELETE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_tunnel_delete, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_TUNNEL_MODIFY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_tunnel_modify, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_TUNNEL_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_tunnel_get, .dumpit = l2tp_nl_cmd_tunnel_dump, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_SESSION_CREATE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_session_create, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_SESSION_DELETE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_session_delete, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_SESSION_MODIFY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_session_modify, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_SESSION_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_session_get, .dumpit = l2tp_nl_cmd_session_dump, .flags = GENL_UNS_ADMIN_PERM, }, }; static struct genl_family l2tp_nl_family __ro_after_init = { .name = L2TP_GENL_NAME, .version = L2TP_GENL_VERSION, .hdrsize = 0, .maxattr = L2TP_ATTR_MAX, .policy = l2tp_nl_policy, .netnsok = true, .module = THIS_MODULE, .small_ops = l2tp_nl_ops, .n_small_ops = ARRAY_SIZE(l2tp_nl_ops), .resv_start_op = L2TP_CMD_SESSION_GET + 1, .mcgrps = l2tp_multicast_group, .n_mcgrps = ARRAY_SIZE(l2tp_multicast_group), }; int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops) { int ret; ret = -EINVAL; if (pw_type >= __L2TP_PWTYPE_MAX) goto err; genl_lock(); ret = -EBUSY; if (l2tp_nl_cmd_ops[pw_type]) goto out; l2tp_nl_cmd_ops[pw_type] = ops; ret = 0; out: genl_unlock(); err: return ret; } EXPORT_SYMBOL_GPL(l2tp_nl_register_ops); void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type) { if (pw_type < __L2TP_PWTYPE_MAX) { genl_lock(); l2tp_nl_cmd_ops[pw_type] = NULL; genl_unlock(); } } EXPORT_SYMBOL_GPL(l2tp_nl_unregister_ops); static int __init l2tp_nl_init(void) { pr_info("L2TP netlink interface\n"); return genl_register_family(&l2tp_nl_family); } static void l2tp_nl_cleanup(void) { genl_unregister_family(&l2tp_nl_family); } module_init(l2tp_nl_init); module_exit(l2tp_nl_cleanup); MODULE_AUTHOR("James Chapman <jchapman@katalix.com>"); MODULE_DESCRIPTION("L2TP netlink"); MODULE_LICENSE("GPL"); MODULE_VERSION("1.0"); MODULE_ALIAS_GENL_FAMILY("l2tp");
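The l2tp_nl_register_ops()/l2tp_nl_unregister_ops() pair exported above is how pseudowire drivers (l2tp_eth, l2tp_ppp) plug their session_create/session_delete callbacks into this generic netlink layer. A minimal sketch follows; the "my_pw" names and empty callback bodies are placeholders, while the l2tp_nl_cmd_ops callback signatures are inferred from the call sites in the listing above (see net/l2tp/l2tp_eth.c for the in-tree L2TP_PWTYPE_ETH user).

/* Hypothetical sketch of a pseudowire driver hooking into the L2TP
 * netlink layer; assumes it is built alongside net/l2tp/l2tp_core.h.
 */
#include <linux/module.h>
#include <linux/l2tp.h>
#include "l2tp_core.h"

static int my_pw_session_create(struct net *net, struct l2tp_tunnel *tunnel,
				u32 session_id, u32 peer_session_id,
				struct l2tp_session_cfg *cfg)
{
	/* Allocate the session and bind it to the pseudowire device here. */
	return 0;
}

static void my_pw_session_delete(struct l2tp_session *session)
{
	/* Tear down whatever session_create() set up. */
}

static const struct l2tp_nl_cmd_ops my_pw_nl_cmd_ops = {
	.session_create	= my_pw_session_create,
	.session_delete	= my_pw_session_delete,
};

static int __init my_pw_init(void)
{
	/* After this, L2TP_CMD_SESSION_CREATE for this pw_type reaches us. */
	return l2tp_nl_register_ops(L2TP_PWTYPE_ETH, &my_pw_nl_cmd_ops);
}
module_init(my_pw_init);

static void __exit my_pw_exit(void)
{
	l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH);
}
module_exit(my_pw_exit);

MODULE_LICENSE("GPL");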
899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_SCHED_GENERIC_H #define __NET_SCHED_GENERIC_H #include <linux/netdevice.h> #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/pkt_sched.h> #include <linux/pkt_cls.h> #include <linux/percpu.h> #include <linux/dynamic_queue_limits.h> #include <linux/list.h> #include <linux/refcount.h> #include <linux/workqueue.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/atomic.h> #include <linux/hashtable.h> #include <net/gen_stats.h> #include <net/rtnetlink.h> #include <net/flow_offload.h> #include <linux/xarray.h> struct Qdisc_ops; struct qdisc_walker; struct tcf_walker; struct module; struct bpf_flow_keys; struct qdisc_rate_table { struct tc_ratespec rate; u32 data[256]; struct qdisc_rate_table *next; int refcnt; }; enum qdisc_state_t { __QDISC_STATE_SCHED, __QDISC_STATE_DEACTIVATED, __QDISC_STATE_MISSED, __QDISC_STATE_DRAINING, }; enum qdisc_state2_t { /* Only for !TCQ_F_NOLOCK qdisc. Never access it directly. * Use qdisc_run_begin/end() or qdisc_is_running() instead. 
*/ __QDISC_STATE2_RUNNING, }; #define QDISC_STATE_MISSED BIT(__QDISC_STATE_MISSED) #define QDISC_STATE_DRAINING BIT(__QDISC_STATE_DRAINING) #define QDISC_STATE_NON_EMPTY (QDISC_STATE_MISSED | \ QDISC_STATE_DRAINING) struct qdisc_size_table { struct rcu_head rcu; struct list_head list; struct tc_sizespec szopts; int refcnt; u16 data[]; }; /* similar to sk_buff_head, but skb->prev pointer is undefined. */ struct qdisc_skb_head { struct sk_buff *head; struct sk_buff *tail; __u32 qlen; spinlock_t lock; }; struct Qdisc { int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free); struct sk_buff * (*dequeue)(struct Qdisc *sch); unsigned int flags; #define TCQ_F_BUILTIN 1 #define TCQ_F_INGRESS 2 #define TCQ_F_CAN_BYPASS 4 #define TCQ_F_MQROOT 8 #define TCQ_F_ONETXQUEUE 0x10 /* dequeue_skb() can assume all skbs are for * q->dev_queue : It can test * netif_xmit_frozen_or_stopped() before * dequeueing next packet. * Its true for MQ/MQPRIO slaves, or non * multiqueue device. */ #define TCQ_F_WARN_NONWC (1 << 16) #define TCQ_F_CPUSTATS 0x20 /* run using percpu statistics */ #define TCQ_F_NOPARENT 0x40 /* root of its hierarchy : * qdisc_tree_decrease_qlen() should stop. */ #define TCQ_F_INVISIBLE 0x80 /* invisible by default in dump */ #define TCQ_F_NOLOCK 0x100 /* qdisc does not require locking */ #define TCQ_F_OFFLOADED 0x200 /* qdisc is offloaded to HW */ u32 limit; const struct Qdisc_ops *ops; struct qdisc_size_table __rcu *stab; struct hlist_node hash; u32 handle; u32 parent; struct netdev_queue *dev_queue; struct net_rate_estimator __rcu *rate_est; struct gnet_stats_basic_sync __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; int pad; refcount_t refcnt; /* * For performance sake on SMP, we put highly modified fields at the end */ struct sk_buff_head gso_skb ____cacheline_aligned_in_smp; struct qdisc_skb_head q; struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; int owner; unsigned long state; unsigned long state2; /* must be written under qdisc spinlock */ struct Qdisc *next_sched; struct sk_buff_head skb_bad_txq; spinlock_t busylock ____cacheline_aligned_in_smp; spinlock_t seqlock; struct rcu_head rcu; netdevice_tracker dev_tracker; struct lock_class_key root_lock_key; /* private data */ long privdata[] ____cacheline_aligned; }; static inline void qdisc_refcount_inc(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN) return; refcount_inc(&qdisc->refcnt); } static inline bool qdisc_refcount_dec_if_one(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN) return true; return refcount_dec_if_one(&qdisc->refcnt); } /* Intended to be used by unlocked users, when concurrent qdisc release is * possible. */ static inline struct Qdisc *qdisc_refcount_inc_nz(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN) return qdisc; if (refcount_inc_not_zero(&qdisc->refcnt)) return qdisc; return NULL; } /* For !TCQ_F_NOLOCK qdisc: callers must either call this within a qdisc * root_lock section, or provide their own memory barriers -- ordering * against qdisc_run_begin/end() atomic bit operations. 
*/ static inline bool qdisc_is_running(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) return spin_is_locked(&qdisc->seqlock); return test_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); } static inline bool nolock_qdisc_is_empty(const struct Qdisc *qdisc) { return !(READ_ONCE(qdisc->state) & QDISC_STATE_NON_EMPTY); } static inline bool qdisc_is_percpu_stats(const struct Qdisc *q) { return q->flags & TCQ_F_CPUSTATS; } static inline bool qdisc_is_empty(const struct Qdisc *qdisc) { if (qdisc_is_percpu_stats(qdisc)) return nolock_qdisc_is_empty(qdisc); return !READ_ONCE(qdisc->q.qlen); } /* For !TCQ_F_NOLOCK qdisc, qdisc_run_begin/end() must be invoked with * the qdisc root lock acquired. */ static inline bool qdisc_run_begin(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) { if (spin_trylock(&qdisc->seqlock)) return true; /* No need to insist if the MISSED flag was already set. * Note that test_and_set_bit() also gives us memory ordering * guarantees wrt potential earlier enqueue() and below * spin_trylock(), both of which are necessary to prevent races */ if (test_and_set_bit(__QDISC_STATE_MISSED, &qdisc->state)) return false; /* Try to take the lock again to make sure that we will either * grab it or the CPU that still has it will see MISSED set * when testing it in qdisc_run_end() */ return spin_trylock(&qdisc->seqlock); } return !__test_and_set_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); } static inline void qdisc_run_end(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) { spin_unlock(&qdisc->seqlock); /* spin_unlock() only has store-release semantic. The unlock * and test_bit() ordering is a store-load ordering, so a full * memory barrier is needed here. */ smp_mb(); if (unlikely(test_bit(__QDISC_STATE_MISSED, &qdisc->state))) __netif_schedule(qdisc); } else { __clear_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); } } static inline bool qdisc_may_bulk(const struct Qdisc *qdisc) { return qdisc->flags & TCQ_F_ONETXQUEUE; } static inline int qdisc_avail_bulklimit(const struct netdev_queue *txq) { return netdev_queue_dql_avail(txq); } struct Qdisc_class_ops { unsigned int flags; /* Child qdisc manipulation */ struct netdev_queue * (*select_queue)(struct Qdisc *, struct tcmsg *); int (*graft)(struct Qdisc *, unsigned long cl, struct Qdisc *, struct Qdisc **, struct netlink_ext_ack *extack); struct Qdisc * (*leaf)(struct Qdisc *, unsigned long cl); void (*qlen_notify)(struct Qdisc *, unsigned long); /* Class manipulation routines */ unsigned long (*find)(struct Qdisc *, u32 classid); int (*change)(struct Qdisc *, u32, u32, struct nlattr **, unsigned long *, struct netlink_ext_ack *); int (*delete)(struct Qdisc *, unsigned long, struct netlink_ext_ack *); void (*walk)(struct Qdisc *, struct qdisc_walker * arg); /* Filter manipulation */ struct tcf_block * (*tcf_block)(struct Qdisc *sch, unsigned long arg, struct netlink_ext_ack *extack); unsigned long (*bind_tcf)(struct Qdisc *, unsigned long, u32 classid); void (*unbind_tcf)(struct Qdisc *, unsigned long); /* rtnetlink specific */ int (*dump)(struct Qdisc *, unsigned long, struct sk_buff *skb, struct tcmsg*); int (*dump_stats)(struct Qdisc *, unsigned long, struct gnet_dump *); }; /* Qdisc_class_ops flag values */ /* Implements API that doesn't require rtnl lock */ enum qdisc_class_ops_flags { QDISC_CLASS_OPS_DOIT_UNLOCKED = 1, }; struct Qdisc_ops { struct Qdisc_ops *next; const struct Qdisc_class_ops *cl_ops; char id[IFNAMSIZ]; int priv_size; unsigned int static_flags; int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, 
struct sk_buff **to_free); struct sk_buff * (*dequeue)(struct Qdisc *); struct sk_buff * (*peek)(struct Qdisc *); int (*init)(struct Qdisc *sch, struct nlattr *arg, struct netlink_ext_ack *extack); void (*reset)(struct Qdisc *); void (*destroy)(struct Qdisc *); int (*change)(struct Qdisc *sch, struct nlattr *arg, struct netlink_ext_ack *extack); void (*attach)(struct Qdisc *sch); int (*change_tx_queue_len)(struct Qdisc *, unsigned int); void (*change_real_num_tx)(struct Qdisc *sch, unsigned int new_real_tx); int (*dump)(struct Qdisc *, struct sk_buff *); int (*dump_stats)(struct Qdisc *, struct gnet_dump *); void (*ingress_block_set)(struct Qdisc *sch, u32 block_index); void (*egress_block_set)(struct Qdisc *sch, u32 block_index); u32 (*ingress_block_get)(struct Qdisc *sch); u32 (*egress_block_get)(struct Qdisc *sch); struct module *owner; }; struct tcf_result { union { struct { unsigned long class; u32 classid; }; const struct tcf_proto *goto_tp; }; }; struct tcf_chain; struct tcf_proto_ops { struct list_head head; char kind[IFNAMSIZ]; int (*classify)(struct sk_buff *, const struct tcf_proto *, struct tcf_result *); int (*init)(struct tcf_proto*); void (*destroy)(struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack); void* (*get)(struct tcf_proto*, u32 handle); void (*put)(struct tcf_proto *tp, void *f); int (*change)(struct net *net, struct sk_buff *, struct tcf_proto*, unsigned long, u32 handle, struct nlattr **, void **, u32, struct netlink_ext_ack *); int (*delete)(struct tcf_proto *tp, void *arg, bool *last, bool rtnl_held, struct netlink_ext_ack *); bool (*delete_empty)(struct tcf_proto *tp); void (*walk)(struct tcf_proto *tp, struct tcf_walker *arg, bool rtnl_held); int (*reoffload)(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, void *cb_priv, struct netlink_ext_ack *extack); void (*hw_add)(struct tcf_proto *tp, void *type_data); void (*hw_del)(struct tcf_proto *tp, void *type_data); void (*bind_class)(void *, u32, unsigned long, void *, unsigned long); void * (*tmplt_create)(struct net *net, struct tcf_chain *chain, struct nlattr **tca, struct netlink_ext_ack *extack); void (*tmplt_destroy)(void *tmplt_priv); void (*tmplt_reoffload)(struct tcf_chain *chain, bool add, flow_setup_cb_t *cb, void *cb_priv); struct tcf_exts * (*get_exts)(const struct tcf_proto *tp, u32 handle); /* rtnetlink specific */ int (*dump)(struct net*, struct tcf_proto*, void *, struct sk_buff *skb, struct tcmsg*, bool); int (*terse_dump)(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t, bool rtnl_held); int (*tmplt_dump)(struct sk_buff *skb, struct net *net, void *tmplt_priv); struct module *owner; int flags; }; /* Classifiers setting TCF_PROTO_OPS_DOIT_UNLOCKED in tcf_proto_ops->flags * are expected to implement tcf_proto_ops->delete_empty(), otherwise race * conditions can occur when filters are inserted/deleted simultaneously. */ enum tcf_proto_ops_flags { TCF_PROTO_OPS_DOIT_UNLOCKED = 1, }; struct tcf_proto { /* Fast access part */ struct tcf_proto __rcu *next; void __rcu *root; /* called under RCU BH lock*/ int (*classify)(struct sk_buff *, const struct tcf_proto *, struct tcf_result *); __be16 protocol; /* All the rest */ u32 prio; void *data; const struct tcf_proto_ops *ops; struct tcf_chain *chain; /* Lock protects tcf_proto shared state and can be used by unlocked * classifiers to protect their private data. 
*/ spinlock_t lock; bool deleting; bool counted; refcount_t refcnt; struct rcu_head rcu; struct hlist_node destroy_ht_node; }; struct qdisc_skb_cb { struct { unsigned int pkt_len; u16 slave_dev_queue_mapping; u16 tc_classid; }; #define QDISC_CB_PRIV_LEN 20 unsigned char data[QDISC_CB_PRIV_LEN]; }; typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv); struct tcf_chain { /* Protects filter_chain. */ struct mutex filter_chain_lock; struct tcf_proto __rcu *filter_chain; struct list_head list; struct tcf_block *block; u32 index; /* chain index */ unsigned int refcnt; unsigned int action_refcnt; bool explicitly_created; bool flushing; const struct tcf_proto_ops *tmplt_ops; void *tmplt_priv; struct rcu_head rcu; }; struct tcf_block { struct xarray ports; /* datapath accessible */ /* Lock protects tcf_block and lifetime-management data of chains * attached to the block (refcnt, action_refcnt, explicitly_created). */ struct mutex lock; struct list_head chain_list; u32 index; /* block index for shared blocks */ u32 classid; /* which class this block belongs to */ refcount_t refcnt; struct net *net; struct Qdisc *q; struct rw_semaphore cb_lock; /* protects cb_list and offload counters */ struct flow_block flow_block; struct list_head owner_list; bool keep_dst; bool bypass_wanted; atomic_t filtercnt; /* Number of filters */ atomic_t skipswcnt; /* Number of skip_sw filters */ atomic_t offloadcnt; /* Number of oddloaded filters */ unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */ unsigned int lockeddevcnt; /* Number of devs that require rtnl lock. */ struct { struct tcf_chain *chain; struct list_head filter_chain_list; } chain0; struct rcu_head rcu; DECLARE_HASHTABLE(proto_destroy_ht, 7); struct mutex proto_destroy_lock; /* Lock for proto_destroy hashtable. 
*/ }; struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index); static inline bool lockdep_tcf_chain_is_locked(struct tcf_chain *chain) { return lockdep_is_held(&chain->filter_chain_lock); } static inline bool lockdep_tcf_proto_is_locked(struct tcf_proto *tp) { return lockdep_is_held(&tp->lock); } #define tcf_chain_dereference(p, chain) \ rcu_dereference_protected(p, lockdep_tcf_chain_is_locked(chain)) #define tcf_proto_dereference(p, tp) \ rcu_dereference_protected(p, lockdep_tcf_proto_is_locked(tp)) static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) { struct qdisc_skb_cb *qcb; BUILD_BUG_ON(sizeof(skb->cb) < sizeof(*qcb)); BUILD_BUG_ON(sizeof(qcb->data) < sz); } static inline int qdisc_qlen(const struct Qdisc *q) { return q->q.qlen; } static inline int qdisc_qlen_sum(const struct Qdisc *q) { __u32 qlen = q->qstats.qlen; int i; if (qdisc_is_percpu_stats(q)) { for_each_possible_cpu(i) qlen += per_cpu_ptr(q->cpu_qstats, i)->qlen; } else { qlen += q->q.qlen; } return qlen; } static inline struct qdisc_skb_cb *qdisc_skb_cb(const struct sk_buff *skb) { return (struct qdisc_skb_cb *)skb->cb; } static inline spinlock_t *qdisc_lock(struct Qdisc *qdisc) { return &qdisc->q.lock; } static inline struct Qdisc *qdisc_root(const struct Qdisc *qdisc) { struct Qdisc *q = rcu_dereference_rtnl(qdisc->dev_queue->qdisc); return q; } static inline struct Qdisc *qdisc_root_bh(const struct Qdisc *qdisc) { return rcu_dereference_bh(qdisc->dev_queue->qdisc); } static inline struct Qdisc *qdisc_root_sleeping(const struct Qdisc *qdisc) { return rcu_dereference_rtnl(qdisc->dev_queue->qdisc_sleeping); } static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc) { struct Qdisc *root = qdisc_root_sleeping(qdisc); ASSERT_RTNL(); return qdisc_lock(root); } static inline struct net_device *qdisc_dev(const struct Qdisc *qdisc) { return qdisc->dev_queue->dev; } static inline void sch_tree_lock(struct Qdisc *q) { if (q->flags & TCQ_F_MQROOT) spin_lock_bh(qdisc_lock(q)); else spin_lock_bh(qdisc_root_sleeping_lock(q)); } static inline void sch_tree_unlock(struct Qdisc *q) { if (q->flags & TCQ_F_MQROOT) spin_unlock_bh(qdisc_lock(q)); else spin_unlock_bh(qdisc_root_sleeping_lock(q)); } extern struct Qdisc noop_qdisc; extern struct Qdisc_ops noop_qdisc_ops; extern struct Qdisc_ops pfifo_fast_ops; extern const u8 sch_default_prio2band[TC_PRIO_MAX + 1]; extern struct Qdisc_ops mq_qdisc_ops; extern struct Qdisc_ops noqueue_qdisc_ops; extern const struct Qdisc_ops *default_qdisc_ops; static inline const struct Qdisc_ops * get_default_qdisc_ops(const struct net_device *dev, int ntx) { return ntx < dev->real_num_tx_queues ? 
default_qdisc_ops : &pfifo_fast_ops; } struct Qdisc_class_common { u32 classid; unsigned int filter_cnt; struct hlist_node hnode; }; struct Qdisc_class_hash { struct hlist_head *hash; unsigned int hashsize; unsigned int hashmask; unsigned int hashelems; }; static inline unsigned int qdisc_class_hash(u32 id, u32 mask) { id ^= id >> 8; id ^= id >> 4; return id & mask; } static inline struct Qdisc_class_common * qdisc_class_find(const struct Qdisc_class_hash *hash, u32 id) { struct Qdisc_class_common *cl; unsigned int h; if (!id) return NULL; h = qdisc_class_hash(id, hash->hashmask); hlist_for_each_entry(cl, &hash->hash[h], hnode) { if (cl->classid == id) return cl; } return NULL; } static inline bool qdisc_class_in_use(const struct Qdisc_class_common *cl) { return cl->filter_cnt > 0; } static inline void qdisc_class_get(struct Qdisc_class_common *cl) { unsigned int res; if (check_add_overflow(cl->filter_cnt, 1, &res)) WARN(1, "Qdisc class overflow"); cl->filter_cnt = res; } static inline void qdisc_class_put(struct Qdisc_class_common *cl) { unsigned int res; if (check_sub_overflow(cl->filter_cnt, 1, &res)) WARN(1, "Qdisc class underflow"); cl->filter_cnt = res; } static inline int tc_classid_to_hwtc(struct net_device *dev, u32 classid) { u32 hwtc = TC_H_MIN(classid) - TC_H_MIN_PRIORITY; return (hwtc < netdev_get_num_tc(dev)) ? hwtc : -EINVAL; } int qdisc_class_hash_init(struct Qdisc_class_hash *); void qdisc_class_hash_insert(struct Qdisc_class_hash *, struct Qdisc_class_common *); void qdisc_class_hash_remove(struct Qdisc_class_hash *, struct Qdisc_class_common *); void qdisc_class_hash_grow(struct Qdisc *, struct Qdisc_class_hash *); void qdisc_class_hash_destroy(struct Qdisc_class_hash *); int dev_qdisc_change_tx_queue_len(struct net_device *dev); void dev_qdisc_change_real_num_tx(struct net_device *dev, unsigned int new_real_tx); void dev_init_scheduler(struct net_device *dev); void dev_shutdown(struct net_device *dev); void dev_activate(struct net_device *dev); void dev_deactivate(struct net_device *dev); void dev_deactivate_many(struct list_head *head); struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, struct Qdisc *qdisc); void qdisc_reset(struct Qdisc *qdisc); void qdisc_destroy(struct Qdisc *qdisc); void qdisc_put(struct Qdisc *qdisc); void qdisc_put_unlocked(struct Qdisc *qdisc); void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, int n, int len); #ifdef CONFIG_NET_SCHED int qdisc_offload_dump_helper(struct Qdisc *q, enum tc_setup_type type, void *type_data); void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch, struct Qdisc *new, struct Qdisc *old, enum tc_setup_type type, void *type_data, struct netlink_ext_ack *extack); #else static inline int qdisc_offload_dump_helper(struct Qdisc *q, enum tc_setup_type type, void *type_data) { q->flags &= ~TCQ_F_OFFLOADED; return 0; } static inline void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch, struct Qdisc *new, struct Qdisc *old, enum tc_setup_type type, void *type_data, struct netlink_ext_ack *extack) { } #endif void qdisc_offload_query_caps(struct net_device *dev, enum tc_setup_type type, void *caps, size_t caps_len); struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, struct netlink_ext_ack *extack); void qdisc_free(struct Qdisc *qdisc); struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, u32 parentid, struct netlink_ext_ack *extack); void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct 
qdisc_size_table *stab); int skb_do_redirect(struct sk_buff *); static inline bool skb_at_tc_ingress(const struct sk_buff *skb) { #ifdef CONFIG_NET_XGRESS return skb->tc_at_ingress; #else return false; #endif } static inline bool skb_skip_tc_classify(struct sk_buff *skb) { #ifdef CONFIG_NET_CLS_ACT if (skb->tc_skip_classify) { skb->tc_skip_classify = 0; return true; } #endif return false; } /* Reset all TX qdiscs greater than index of a device. */ static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i) { struct Qdisc *qdisc; for (; i < dev->num_tx_queues; i++) { qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc); if (qdisc) { spin_lock_bh(qdisc_lock(qdisc)); qdisc_reset(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); } } } /* Are all TX queues of the device empty? */ static inline bool qdisc_all_tx_empty(const struct net_device *dev) { unsigned int i; rcu_read_lock(); for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); const struct Qdisc *q = rcu_dereference(txq->qdisc); if (!qdisc_is_empty(q)) { rcu_read_unlock(); return false; } } rcu_read_unlock(); return true; } /* Are any of the TX qdiscs changing? */ static inline bool qdisc_tx_changing(const struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); if (rcu_access_pointer(txq->qdisc) != rcu_access_pointer(txq->qdisc_sleeping)) return true; } return false; } /* Is the device using the noop qdisc on all queues? */ static inline bool qdisc_tx_is_noop(const struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); if (rcu_access_pointer(txq->qdisc) != &noop_qdisc) return false; } return true; } static inline unsigned int qdisc_pkt_len(const struct sk_buff *skb) { return qdisc_skb_cb(skb)->pkt_len; } /* additional qdisc xmit flags (NET_XMIT_MASK in linux/netdevice.h) */ enum net_xmit_qdisc_t { __NET_XMIT_STOLEN = 0x00010000, __NET_XMIT_BYPASS = 0x00020000, }; #ifdef CONFIG_NET_CLS_ACT #define net_xmit_drop_count(e) ((e) & __NET_XMIT_STOLEN ? 0 : 1) #else #define net_xmit_drop_count(e) (1) #endif static inline void qdisc_calculate_pkt_len(struct sk_buff *skb, const struct Qdisc *sch) { #ifdef CONFIG_NET_SCHED struct qdisc_size_table *stab = rcu_dereference_bh(sch->stab); if (stab) __qdisc_calculate_pkt_len(skb, stab); #endif } static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { return sch->enqueue(skb, sch, to_free); } static inline void _bstats_update(struct gnet_stats_basic_sync *bstats, __u64 bytes, __u32 packets) { u64_stats_update_begin(&bstats->syncp); u64_stats_add(&bstats->bytes, bytes); u64_stats_add(&bstats->packets, packets); u64_stats_update_end(&bstats->syncp); } static inline void bstats_update(struct gnet_stats_basic_sync *bstats, const struct sk_buff *skb) { _bstats_update(bstats, qdisc_pkt_len(skb), skb_is_gso(skb) ? 
skb_shinfo(skb)->gso_segs : 1); } static inline void qdisc_bstats_cpu_update(struct Qdisc *sch, const struct sk_buff *skb) { bstats_update(this_cpu_ptr(sch->cpu_bstats), skb); } static inline void qdisc_bstats_update(struct Qdisc *sch, const struct sk_buff *skb) { bstats_update(&sch->bstats, skb); } static inline void qdisc_qstats_backlog_dec(struct Qdisc *sch, const struct sk_buff *skb) { sch->qstats.backlog -= qdisc_pkt_len(skb); } static inline void qdisc_qstats_cpu_backlog_dec(struct Qdisc *sch, const struct sk_buff *skb) { this_cpu_sub(sch->cpu_qstats->backlog, qdisc_pkt_len(skb)); } static inline void qdisc_qstats_backlog_inc(struct Qdisc *sch, const struct sk_buff *skb) { sch->qstats.backlog += qdisc_pkt_len(skb); } static inline void qdisc_qstats_cpu_backlog_inc(struct Qdisc *sch, const struct sk_buff *skb) { this_cpu_add(sch->cpu_qstats->backlog, qdisc_pkt_len(skb)); } static inline void qdisc_qstats_cpu_qlen_inc(struct Qdisc *sch) { this_cpu_inc(sch->cpu_qstats->qlen); } static inline void qdisc_qstats_cpu_qlen_dec(struct Qdisc *sch) { this_cpu_dec(sch->cpu_qstats->qlen); } static inline void qdisc_qstats_cpu_requeues_inc(struct Qdisc *sch) { this_cpu_inc(sch->cpu_qstats->requeues); } static inline void __qdisc_qstats_drop(struct Qdisc *sch, int count) { sch->qstats.drops += count; } static inline void qstats_drop_inc(struct gnet_stats_queue *qstats) { qstats->drops++; } static inline void qstats_overlimit_inc(struct gnet_stats_queue *qstats) { qstats->overlimits++; } static inline void qdisc_qstats_drop(struct Qdisc *sch) { qstats_drop_inc(&sch->qstats); } static inline void qdisc_qstats_cpu_drop(struct Qdisc *sch) { this_cpu_inc(sch->cpu_qstats->drops); } static inline void qdisc_qstats_overlimit(struct Qdisc *sch) { sch->qstats.overlimits++; } static inline int qdisc_qstats_copy(struct gnet_dump *d, struct Qdisc *sch) { __u32 qlen = qdisc_qlen_sum(sch); return gnet_stats_copy_queue(d, sch->cpu_qstats, &sch->qstats, qlen); } static inline void qdisc_qstats_qlen_backlog(struct Qdisc *sch, __u32 *qlen, __u32 *backlog) { struct gnet_stats_queue qstats = { 0 }; gnet_stats_add_queue(&qstats, sch->cpu_qstats, &sch->qstats); *qlen = qstats.qlen + qdisc_qlen(sch); *backlog = qstats.backlog; } static inline void qdisc_tree_flush_backlog(struct Qdisc *sch) { __u32 qlen, backlog; qdisc_qstats_qlen_backlog(sch, &qlen, &backlog); qdisc_tree_reduce_backlog(sch, qlen, backlog); } static inline void qdisc_purge_queue(struct Qdisc *sch) { __u32 qlen, backlog; qdisc_qstats_qlen_backlog(sch, &qlen, &backlog); qdisc_reset(sch); qdisc_tree_reduce_backlog(sch, qlen, backlog); } static inline void __qdisc_enqueue_tail(struct sk_buff *skb, struct qdisc_skb_head *qh) { struct sk_buff *last = qh->tail; if (last) { skb->next = NULL; last->next = skb; qh->tail = skb; } else { qh->tail = skb; qh->head = skb; } qh->qlen++; } static inline int qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch) { __qdisc_enqueue_tail(skb, &sch->q); qdisc_qstats_backlog_inc(sch, skb); return NET_XMIT_SUCCESS; } static inline void __qdisc_enqueue_head(struct sk_buff *skb, struct qdisc_skb_head *qh) { skb->next = qh->head; if (!qh->head) qh->tail = skb; qh->head = skb; qh->qlen++; } static inline struct sk_buff *__qdisc_dequeue_head(struct qdisc_skb_head *qh) { struct sk_buff *skb = qh->head; if (likely(skb != NULL)) { qh->head = skb->next; qh->qlen--; if (qh->head == NULL) qh->tail = NULL; skb->next = NULL; } return skb; } static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) { struct sk_buff *skb = 
__qdisc_dequeue_head(&sch->q); if (likely(skb != NULL)) { qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); } return skb; } struct tc_skb_cb { struct qdisc_skb_cb qdisc_cb; u32 drop_reason; u16 zone; /* Only valid if post_ct = true */ u16 mru; u8 post_ct:1; u8 post_ct_snat:1; u8 post_ct_dnat:1; }; static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb) { struct tc_skb_cb *cb = (struct tc_skb_cb *)skb->cb; BUILD_BUG_ON(sizeof(*cb) > sizeof_field(struct sk_buff, cb)); return cb; } static inline enum skb_drop_reason tcf_get_drop_reason(const struct sk_buff *skb) { return tc_skb_cb(skb)->drop_reason; } static inline void tcf_set_drop_reason(const struct sk_buff *skb, enum skb_drop_reason reason) { tc_skb_cb(skb)->drop_reason = reason; } /* Instead of calling kfree_skb() while root qdisc lock is held, * queue the skb for future freeing at end of __dev_xmit_skb() */ static inline void __qdisc_drop(struct sk_buff *skb, struct sk_buff **to_free) { skb->next = *to_free; *to_free = skb; } static inline void __qdisc_drop_all(struct sk_buff *skb, struct sk_buff **to_free) { if (skb->prev) skb->prev->next = *to_free; else skb->next = *to_free; *to_free = skb; } static inline unsigned int __qdisc_queue_drop_head(struct Qdisc *sch, struct qdisc_skb_head *qh, struct sk_buff **to_free) { struct sk_buff *skb = __qdisc_dequeue_head(qh); if (likely(skb != NULL)) { unsigned int len = qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); __qdisc_drop(skb, to_free); return len; } return 0; } static inline struct sk_buff *qdisc_peek_head(struct Qdisc *sch) { const struct qdisc_skb_head *qh = &sch->q; return qh->head; } /* generic pseudo peek method for non-work-conserving qdisc */ static inline struct sk_buff *qdisc_peek_dequeued(struct Qdisc *sch) { struct sk_buff *skb = skb_peek(&sch->gso_skb); /* we can reuse ->gso_skb because peek isn't called for root qdiscs */ if (!skb) { skb = sch->dequeue(sch); if (skb) { __skb_queue_head(&sch->gso_skb, skb); /* it's still part of the queue */ qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; } } return skb; } static inline void qdisc_update_stats_at_dequeue(struct Qdisc *sch, struct sk_buff *skb) { if (qdisc_is_percpu_stats(sch)) { qdisc_qstats_cpu_backlog_dec(sch, skb); qdisc_bstats_cpu_update(sch, skb); qdisc_qstats_cpu_qlen_dec(sch); } else { qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); sch->q.qlen--; } } static inline void qdisc_update_stats_at_enqueue(struct Qdisc *sch, unsigned int pkt_len) { if (qdisc_is_percpu_stats(sch)) { qdisc_qstats_cpu_qlen_inc(sch); this_cpu_add(sch->cpu_qstats->backlog, pkt_len); } else { sch->qstats.backlog += pkt_len; sch->q.qlen++; } } /* use instead of qdisc->dequeue() for all qdiscs queried with ->peek() */ static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch) { struct sk_buff *skb = skb_peek(&sch->gso_skb); if (skb) { skb = __skb_dequeue(&sch->gso_skb); if (qdisc_is_percpu_stats(sch)) { qdisc_qstats_cpu_backlog_dec(sch, skb); qdisc_qstats_cpu_qlen_dec(sch); } else { qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; } } else { skb = sch->dequeue(sch); } return skb; } static inline void __qdisc_reset_queue(struct qdisc_skb_head *qh) { /* * We do not know the backlog in bytes of this list, it * is up to the caller to correct it */ ASSERT_RTNL(); if (qh->qlen) { rtnl_kfree_skbs(qh->head, qh->tail); qh->head = NULL; qh->tail = NULL; qh->qlen = 0; } } static inline void qdisc_reset_queue(struct Qdisc *sch) { __qdisc_reset_queue(&sch->q); } static inline struct Qdisc 
*qdisc_replace(struct Qdisc *sch, struct Qdisc *new, struct Qdisc **pold) { struct Qdisc *old; sch_tree_lock(sch); old = *pold; *pold = new; if (old != NULL) qdisc_purge_queue(old); sch_tree_unlock(sch); return old; } static inline void rtnl_qdisc_drop(struct sk_buff *skb, struct Qdisc *sch) { rtnl_kfree_skbs(skb, skb); qdisc_qstats_drop(sch); } static inline int qdisc_drop_cpu(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { __qdisc_drop(skb, to_free); qdisc_qstats_cpu_drop(sch); return NET_XMIT_DROP; } static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { __qdisc_drop(skb, to_free); qdisc_qstats_drop(sch); return NET_XMIT_DROP; } static inline int qdisc_drop_all(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { __qdisc_drop_all(skb, to_free); qdisc_qstats_drop(sch); return NET_XMIT_DROP; } struct psched_ratecfg { u64 rate_bytes_ps; /* bytes per second */ u32 mult; u16 overhead; u16 mpu; u8 linklayer; u8 shift; }; static inline u64 psched_l2t_ns(const struct psched_ratecfg *r, unsigned int len) { len += r->overhead; if (len < r->mpu) len = r->mpu; if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) return ((u64)(DIV_ROUND_UP(len,48)*53) * r->mult) >> r->shift; return ((u64)len * r->mult) >> r->shift; } void psched_ratecfg_precompute(struct psched_ratecfg *r, const struct tc_ratespec *conf, u64 rate64); static inline void psched_ratecfg_getrate(struct tc_ratespec *res, const struct psched_ratecfg *r) { memset(res, 0, sizeof(*res)); /* legacy struct tc_ratespec has a 32bit @rate field * Qdisc using 64bit rate should add new attributes * in order to maintain compatibility. */ res->rate = min_t(u64, r->rate_bytes_ps, ~0U); res->overhead = r->overhead; res->mpu = r->mpu; res->linklayer = (r->linklayer & TC_LINKLAYER_MASK); } struct psched_pktrate { u64 rate_pkts_ps; /* packets per second */ u32 mult; u8 shift; }; static inline u64 psched_pkt2t_ns(const struct psched_pktrate *r, unsigned int pkt_num) { return ((u64)pkt_num * r->mult) >> r->shift; } void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64); /* Mini Qdisc serves for specific needs of ingress/clsact Qdisc. * The fast path only needs to access filter list and to update stats */ struct mini_Qdisc { struct tcf_proto *filter_list; struct tcf_block *block; struct gnet_stats_basic_sync __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; unsigned long rcu_state; }; static inline void mini_qdisc_bstats_cpu_update(struct mini_Qdisc *miniq, const struct sk_buff *skb) { bstats_update(this_cpu_ptr(miniq->cpu_bstats), skb); } static inline void mini_qdisc_qstats_cpu_drop(struct mini_Qdisc *miniq) { this_cpu_inc(miniq->cpu_qstats->drops); } struct mini_Qdisc_pair { struct mini_Qdisc miniq1; struct mini_Qdisc miniq2; struct mini_Qdisc __rcu **p_miniq; }; void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, struct tcf_proto *tp_head); void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc, struct mini_Qdisc __rcu **p_miniq); void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp, struct tcf_block *block); void mq_change_real_num_tx(struct Qdisc *sch, unsigned int new_real_tx); int sch_frag_xmit_hook(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb)); /* Make sure qdisc is no longer in SCHED state. */ static inline void qdisc_synchronize(const struct Qdisc *q) { while (test_bit(__QDISC_STATE_SCHED, &q->state)) msleep(1); } #endif
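To make the helpers above concrete, here is a minimal, hypothetical consumer (not part of sch_generic.h): a bare-bones FIFO in the style of net/sched/sch_fifo.c, built only from helpers declared in this header. The toy_fifo_* names are invented for illustration, ->init()/register_qdisc() plumbing is omitted, and sch->limit is assumed to have been set at init time.

/* Illustration only: a bare-bones FIFO assembled from sch_generic.h helpers. */
static int toy_fifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			    struct sk_buff **to_free)
{
	if (likely(sch->q.qlen < READ_ONCE(sch->limit)))
		return qdisc_enqueue_tail(skb, sch);	/* queues skb, bumps backlog */

	return qdisc_drop(skb, sch, to_free);		/* counts drop, defers kfree */
}

static struct sk_buff *toy_fifo_dequeue(struct Qdisc *sch)
{
	return qdisc_dequeue_head(sch);			/* updates bstats and backlog */
}

static struct Qdisc_ops toy_fifo_qdisc_ops __read_mostly = {
	.id		= "toy_fifo",
	.enqueue	= toy_fifo_enqueue,
	.dequeue	= toy_fifo_dequeue,
	.peek		= qdisc_peek_head,
	.owner		= THIS_MODULE,
};

Because qdisc_enqueue_tail() and qdisc_dequeue_head() keep q.qlen, qstats.backlog and bstats consistent, the toy code never has to touch those counters directly.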
// SPDX-License-Identifier: GPL-2.0-or-later
/* Socket buffer accounting
 *
 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/net.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/af_rxrpc.h>
#include "ar-internal.h"

#define select_skb_count(skb) (&rxrpc_n_rx_skbs)

/*
 * Note the allocation or reception of a socket buffer.
 */
void rxrpc_new_skb(struct sk_buff *skb, enum rxrpc_skb_trace why)
{
	int n = atomic_inc_return(select_skb_count(skb));

	trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why);
}

/*
 * Note the re-emergence of a socket buffer from a queue or buffer.
 */
void rxrpc_see_skb(struct sk_buff *skb, enum rxrpc_skb_trace why)
{
	if (skb) {
		int n = atomic_read(select_skb_count(skb));

		trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why);
	}
}

/*
 * Note the addition of a ref on a socket buffer.
 */
void rxrpc_get_skb(struct sk_buff *skb, enum rxrpc_skb_trace why)
{
	int n = atomic_inc_return(select_skb_count(skb));

	trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why);
	skb_get(skb);
}

/*
 * Note the dropping of a ref on a socket buffer by the core.
 */
void rxrpc_eaten_skb(struct sk_buff *skb, enum rxrpc_skb_trace why)
{
	int n = atomic_inc_return(&rxrpc_n_rx_skbs);

	trace_rxrpc_skb(skb, 0, n, why);
}

/*
 * Note the destruction of a socket buffer.
 */
void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace why)
{
	if (skb) {
		int n = atomic_dec_return(select_skb_count(skb));

		trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why);
		consume_skb(skb);
	}
}

/*
 * Clear a queue of socket buffers.
 */
void rxrpc_purge_queue(struct sk_buff_head *list)
{
	struct sk_buff *skb;

	while ((skb = skb_dequeue((list))) != NULL) {
		int n = atomic_dec_return(select_skb_count(skb));

		trace_rxrpc_skb(skb, refcount_read(&skb->users), n,
				rxrpc_skb_put_purge);
		consume_skb(skb);
	}
}
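A short, hedged sketch of how the helpers above are meant to pair up; the function name is hypothetical and the trace reasons are taken as parameters rather than invented. Every rxrpc_new_skb()/rxrpc_get_skb() should eventually be matched by rxrpc_free_skb() or a purge, so rxrpc_n_rx_skbs returns to its previous value.

/* Illustration only: the counter stays balanced when note and free pair up. */
static void example_receive_and_drop(struct sk_buff *skb,
				     enum rxrpc_skb_trace why_new,
				     enum rxrpc_skb_trace why_put)
{
	rxrpc_new_skb(skb, why_new);	/* +1 on rxrpc_n_rx_skbs, trace arrival */

	/* ... the packet turns out to be unwanted ... */

	rxrpc_free_skb(skb, why_put);	/* -1 on rxrpc_n_rx_skbs, consume_skb() */
}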
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_PKT_CLS_H
#define __NET_PKT_CLS_H

#include <linux/pkt_cls.h>
#include <linux/workqueue.h>
#include <net/sch_generic.h>
#include <net/act_api.h>
#include <net/net_namespace.h>

/* TC action not accessible from user space */
#define TC_ACT_CONSUMED		(TC_ACT_VALUE_MAX + 1)

/* Basic packet classifier frontend definitions. */

struct tcf_walker {
	int	stop;
	int	skip;
	int	count;
	bool	nonempty;
	unsigned long cookie;
	int	(*fn)(struct tcf_proto *, void *node, struct tcf_walker *);
};

int register_tcf_proto_ops(struct tcf_proto_ops *ops);
void unregister_tcf_proto_ops(struct tcf_proto_ops *ops);
#define NET_CLS_ALIAS_PREFIX "net-cls-"
#define MODULE_ALIAS_NET_CLS(kind)	MODULE_ALIAS(NET_CLS_ALIAS_PREFIX kind)

struct tcf_block_ext_info {
	enum flow_block_binder_type binder_type;
	tcf_chain_head_change_t *chain_head_change;
	void *chain_head_change_priv;
	u32 block_index;
};

struct tcf_qevent {
	struct tcf_block	*block;
	struct tcf_block_ext_info info;
	struct tcf_proto __rcu *filter_chain;
};

struct tcf_block_cb;
bool tcf_queue_work(struct rcu_work *rwork, work_func_t func);

#ifdef CONFIG_NET_CLS
struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block,
				       u32 chain_index);
void tcf_chain_put_by_act(struct tcf_chain *chain);
struct tcf_chain *tcf_get_next_chain(struct tcf_block *block,
				     struct tcf_chain *chain);
struct tcf_proto *tcf_get_next_proto(struct tcf_chain *chain,
				     struct tcf_proto *tp);
void tcf_block_netif_keep_dst(struct tcf_block *block);
int tcf_block_get(struct tcf_block **p_block,
		  struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q,
		  struct netlink_ext_ack *extack);
int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
		      struct tcf_block_ext_info *ei,
		      struct netlink_ext_ack *extack);
void tcf_block_put(struct tcf_block *block);
void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
		       struct tcf_block_ext_info *ei);
int tcf_exts_init_ex(struct tcf_exts *exts, struct net *net, int action,
		     int police, struct tcf_proto *tp, u32 handle,
		     bool used_action_miss);

static inline bool tcf_block_shared(struct tcf_block *block)
{
	return block->index;
}

static inline bool tcf_block_non_null_shared(struct tcf_block *block)
{
	return block && block->index;
}

#ifdef CONFIG_NET_CLS_ACT
DECLARE_STATIC_KEY_FALSE(tcf_bypass_check_needed_key);

static inline bool tcf_block_bypass_sw(struct tcf_block *block)
{
	return block && block->bypass_wanted;
}
#endif

static inline struct Qdisc *tcf_block_q(struct tcf_block *block)
{
	WARN_ON(tcf_block_shared(block));
	return block->q;
}

int tcf_classify(struct sk_buff *skb,
		 const struct tcf_block *block,
		 const struct tcf_proto *tp, struct tcf_result *res,
		 bool compat_mode);

static inline bool tc_cls_stats_dump(struct tcf_proto *tp,
				     struct tcf_walker *arg,
				     void *filter)
{
	if (arg->count >= arg->skip &&
arg->fn(tp, filter, arg) < 0) { arg->stop = 1; return false; } arg->count++; return true; } #else static inline bool tcf_block_shared(struct tcf_block *block) { return false; } static inline bool tcf_block_non_null_shared(struct tcf_block *block) { return false; } static inline int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, struct netlink_ext_ack *extack) { return 0; } static inline int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, struct tcf_block_ext_info *ei, struct netlink_ext_ack *extack) { return 0; } static inline void tcf_block_put(struct tcf_block *block) { } static inline void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei) { } static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { return NULL; } static inline int tcf_classify(struct sk_buff *skb, const struct tcf_block *block, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode) { return TC_ACT_UNSPEC; } #endif static inline unsigned long __cls_set_class(unsigned long *clp, unsigned long cl) { return xchg(clp, cl); } static inline void __tcf_bind_filter(struct Qdisc *q, struct tcf_result *r, unsigned long base) { unsigned long cl; cl = q->ops->cl_ops->bind_tcf(q, base, r->classid); cl = __cls_set_class(&r->class, cl); if (cl) q->ops->cl_ops->unbind_tcf(q, cl); } static inline void tcf_bind_filter(struct tcf_proto *tp, struct tcf_result *r, unsigned long base) { struct Qdisc *q = tp->chain->block->q; /* Check q as it is not set for shared blocks. In that case, * setting class is not supported. */ if (!q) return; sch_tree_lock(q); __tcf_bind_filter(q, r, base); sch_tree_unlock(q); } static inline void __tcf_unbind_filter(struct Qdisc *q, struct tcf_result *r) { unsigned long cl; if ((cl = __cls_set_class(&r->class, 0)) != 0) q->ops->cl_ops->unbind_tcf(q, cl); } static inline void tcf_unbind_filter(struct tcf_proto *tp, struct tcf_result *r) { struct Qdisc *q = tp->chain->block->q; if (!q) return; __tcf_unbind_filter(q, r); } static inline void tc_cls_bind_class(u32 classid, unsigned long cl, void *q, struct tcf_result *res, unsigned long base) { if (res->classid == classid) { if (cl) __tcf_bind_filter(q, res, base); else __tcf_unbind_filter(q, res); } } struct tcf_exts { #ifdef CONFIG_NET_CLS_ACT __u32 type; /* for backward compat(TCA_OLD_COMPAT) */ int nr_actions; struct tc_action **actions; struct net *net; netns_tracker ns_tracker; struct tcf_exts_miss_cookie_node *miss_cookie_node; #endif /* Map to export classifier specific extension TLV types to the * generic extensions API. Unsupported extensions must be set to 0. */ int action; int police; }; static inline int tcf_exts_init(struct tcf_exts *exts, struct net *net, int action, int police) { #ifdef CONFIG_NET_CLS return tcf_exts_init_ex(exts, net, action, police, NULL, 0, false); #else return -EOPNOTSUPP; #endif } /* Return false if the netns is being destroyed in cleanup_net(). Callers * need to do cleanup synchronously in this case, otherwise may race with * tc_action_net_exit(). Return true for other cases. 
*/ static inline bool tcf_exts_get_net(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT exts->net = maybe_get_net(exts->net); if (exts->net) netns_tracker_alloc(exts->net, &exts->ns_tracker, GFP_KERNEL); return exts->net != NULL; #else return true; #endif } static inline void tcf_exts_put_net(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT if (exts->net) put_net_track(exts->net, &exts->ns_tracker); #endif } #ifdef CONFIG_NET_CLS_ACT #define tcf_exts_for_each_action(i, a, exts) \ for (i = 0; i < TCA_ACT_MAX_PRIO && ((a) = (exts)->actions[i]); i++) #else #define tcf_exts_for_each_action(i, a, exts) \ for (; 0; (void)(i), (void)(a), (void)(exts)) #endif #define tcf_act_for_each_action(i, a, actions) \ for (i = 0; i < TCA_ACT_MAX_PRIO && ((a) = actions[i]); i++) static inline bool tc_act_in_hw(struct tc_action *act) { return !!act->in_hw_count; } static inline void tcf_exts_hw_stats_update(const struct tcf_exts *exts, struct flow_stats *stats, bool use_act_stats) { #ifdef CONFIG_NET_CLS_ACT int i; for (i = 0; i < exts->nr_actions; i++) { struct tc_action *a = exts->actions[i]; if (use_act_stats || tc_act_in_hw(a)) { if (!tcf_action_update_hw_stats(a)) continue; } preempt_disable(); tcf_action_stats_update(a, stats->bytes, stats->pkts, stats->drops, stats->lastused, true); preempt_enable(); a->used_hw_stats = stats->used_hw_stats; a->used_hw_stats_valid = stats->used_hw_stats_valid; } #endif } /** * tcf_exts_has_actions - check if at least one action is present * @exts: tc filter extensions handle * * Returns true if at least one action is present. */ static inline bool tcf_exts_has_actions(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT return exts->nr_actions; #else return false; #endif } /** * tcf_exts_exec - execute tc filter extensions * @skb: socket buffer * @exts: tc filter extensions handle * @res: desired result * * Executes all configured extensions. Returns TC_ACT_OK on a normal execution, * a negative number if the filter must be considered unmatched or * a positive action code (TC_ACT_*) which must be returned to the * underlying layer. 
*/ static inline int tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts, struct tcf_result *res) { #ifdef CONFIG_NET_CLS_ACT return tcf_action_exec(skb, exts->actions, exts->nr_actions, res); #endif return TC_ACT_OK; } static inline int tcf_exts_exec_ex(struct sk_buff *skb, struct tcf_exts *exts, int act_index, struct tcf_result *res) { #ifdef CONFIG_NET_CLS_ACT return tcf_action_exec(skb, exts->actions + act_index, exts->nr_actions - act_index, res); #else return TC_ACT_OK; #endif } int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, struct tcf_exts *exts, u32 flags, struct netlink_ext_ack *extack); int tcf_exts_validate_ex(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, struct tcf_exts *exts, u32 flags, u32 fl_flags, struct netlink_ext_ack *extack); void tcf_exts_destroy(struct tcf_exts *exts); void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src); int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_terse_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts); /** * struct tcf_pkt_info - packet information * * @ptr: start of the pkt data * @nexthdr: offset of the next header */ struct tcf_pkt_info { unsigned char * ptr; int nexthdr; }; #ifdef CONFIG_NET_EMATCH struct tcf_ematch_ops; /** * struct tcf_ematch - extended match (ematch) * * @matchid: identifier to allow userspace to reidentify a match * @flags: flags specifying attributes and the relation to other matches * @ops: the operations lookup table of the corresponding ematch module * @datalen: length of the ematch specific configuration data * @data: ematch specific data * @net: the network namespace */ struct tcf_ematch { struct tcf_ematch_ops * ops; unsigned long data; unsigned int datalen; u16 matchid; u16 flags; struct net *net; }; static inline int tcf_em_is_container(struct tcf_ematch *em) { return !em->ops; } static inline int tcf_em_is_simple(struct tcf_ematch *em) { return em->flags & TCF_EM_SIMPLE; } static inline int tcf_em_is_inverted(struct tcf_ematch *em) { return em->flags & TCF_EM_INVERT; } static inline int tcf_em_last_match(struct tcf_ematch *em) { return (em->flags & TCF_EM_REL_MASK) == TCF_EM_REL_END; } static inline int tcf_em_early_end(struct tcf_ematch *em, int result) { if (tcf_em_last_match(em)) return 1; if (result == 0 && em->flags & TCF_EM_REL_AND) return 1; if (result != 0 && em->flags & TCF_EM_REL_OR) return 1; return 0; } /** * struct tcf_ematch_tree - ematch tree handle * * @hdr: ematch tree header supplied by userspace * @matches: array of ematches */ struct tcf_ematch_tree { struct tcf_ematch_tree_hdr hdr; struct tcf_ematch * matches; }; /** * struct tcf_ematch_ops - ematch module operations * * @kind: identifier (kind) of this ematch module * @datalen: length of expected configuration data (optional) * @change: called during validation (optional) * @match: called during ematch tree evaluation, must return 1/0 * @destroy: called during destroyage (optional) * @dump: called during dumping process (optional) * @owner: owner, must be set to THIS_MODULE * @link: link to previous/next ematch module (internal use) */ struct tcf_ematch_ops { int kind; int datalen; int (*change)(struct net *net, void *, int, struct tcf_ematch *); int (*match)(struct sk_buff *, struct tcf_ematch *, struct tcf_pkt_info *); void (*destroy)(struct tcf_ematch *); int (*dump)(struct sk_buff *, struct tcf_ematch *); struct module *owner; 
struct list_head link; }; int tcf_em_register(struct tcf_ematch_ops *); void tcf_em_unregister(struct tcf_ematch_ops *); int tcf_em_tree_validate(struct tcf_proto *, struct nlattr *, struct tcf_ematch_tree *); void tcf_em_tree_destroy(struct tcf_ematch_tree *); int tcf_em_tree_dump(struct sk_buff *, struct tcf_ematch_tree *, int); int __tcf_em_tree_match(struct sk_buff *, struct tcf_ematch_tree *, struct tcf_pkt_info *); /** * tcf_em_tree_match - evaluate an ematch tree * * @skb: socket buffer of the packet in question * @tree: ematch tree to be used for evaluation * @info: packet information examined by classifier * * This function matches @skb against the ematch tree in @tree by going * through all ematches respecting their logic relations returning * as soon as the result is obvious. * * Returns 1 if the ematch tree as-one matches, no ematches are configured * or ematch is not enabled in the kernel, otherwise 0 is returned. */ static inline int tcf_em_tree_match(struct sk_buff *skb, struct tcf_ematch_tree *tree, struct tcf_pkt_info *info) { if (tree->hdr.nmatches) return __tcf_em_tree_match(skb, tree, info); else return 1; } #define MODULE_ALIAS_TCF_EMATCH(kind) MODULE_ALIAS("ematch-kind-" __stringify(kind)) #else /* CONFIG_NET_EMATCH */ struct tcf_ematch_tree { }; #define tcf_em_tree_validate(tp, tb, t) ((void)(t), 0) #define tcf_em_tree_destroy(t) do { (void)(t); } while(0) #define tcf_em_tree_dump(skb, t, tlv) (0) #define tcf_em_tree_match(skb, t, info) ((void)(info), 1) #endif /* CONFIG_NET_EMATCH */ static inline unsigned char * tcf_get_base_ptr(struct sk_buff *skb, int layer) { switch (layer) { case TCF_LAYER_LINK: return skb_mac_header(skb); case TCF_LAYER_NETWORK: return skb_network_header(skb); case TCF_LAYER_TRANSPORT: return skb_transport_header(skb); } return NULL; } static inline int tcf_valid_offset(const struct sk_buff *skb, const unsigned char *ptr, const int len) { return likely((ptr + len) <= skb_tail_pointer(skb) && ptr >= skb->head && (ptr <= (ptr + len))); } static inline int tcf_change_indev(struct net *net, struct nlattr *indev_tlv, struct netlink_ext_ack *extack) { char indev[IFNAMSIZ]; struct net_device *dev; if (nla_strscpy(indev, indev_tlv, IFNAMSIZ) < 0) { NL_SET_ERR_MSG_ATTR(extack, indev_tlv, "Interface name too long"); return -EINVAL; } dev = __dev_get_by_name(net, indev); if (!dev) { NL_SET_ERR_MSG_ATTR(extack, indev_tlv, "Network device not found"); return -ENODEV; } return dev->ifindex; } static inline bool tcf_match_indev(struct sk_buff *skb, int ifindex) { if (!ifindex) return true; if (!skb->skb_iif) return false; return ifindex == skb->skb_iif; } int tc_setup_offload_action(struct flow_action *flow_action, const struct tcf_exts *exts, struct netlink_ext_ack *extack); void tc_cleanup_offload_action(struct flow_action *flow_action); int tc_setup_action(struct flow_action *flow_action, struct tc_action *actions[], u32 miss_cookie_base, struct netlink_ext_ack *extack); int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type, void *type_data, bool err_stop, bool rtnl_held); int tc_setup_cb_add(struct tcf_block *block, struct tcf_proto *tp, enum tc_setup_type type, void *type_data, bool err_stop, u32 *flags, unsigned int *in_hw_count, bool rtnl_held); int tc_setup_cb_replace(struct tcf_block *block, struct tcf_proto *tp, enum tc_setup_type type, void *type_data, bool err_stop, u32 *old_flags, unsigned int *old_in_hw_count, u32 *new_flags, unsigned int *new_in_hw_count, bool rtnl_held); int tc_setup_cb_destroy(struct tcf_block *block, struct 
tcf_proto *tp, enum tc_setup_type type, void *type_data, bool err_stop, u32 *flags, unsigned int *in_hw_count, bool rtnl_held); int tc_setup_cb_reoffload(struct tcf_block *block, struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, enum tc_setup_type type, void *type_data, void *cb_priv, u32 *flags, unsigned int *in_hw_count); unsigned int tcf_exts_num_actions(struct tcf_exts *exts); #ifdef CONFIG_NET_CLS_ACT int tcf_qevent_init(struct tcf_qevent *qe, struct Qdisc *sch, enum flow_block_binder_type binder_type, struct nlattr *block_index_attr, struct netlink_ext_ack *extack); void tcf_qevent_destroy(struct tcf_qevent *qe, struct Qdisc *sch); int tcf_qevent_validate_change(struct tcf_qevent *qe, struct nlattr *block_index_attr, struct netlink_ext_ack *extack); struct sk_buff *tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, struct sk_buff *skb, struct sk_buff **to_free, int *ret); int tcf_qevent_dump(struct sk_buff *skb, int attr_name, struct tcf_qevent *qe); #else static inline int tcf_qevent_init(struct tcf_qevent *qe, struct Qdisc *sch, enum flow_block_binder_type binder_type, struct nlattr *block_index_attr, struct netlink_ext_ack *extack) { return 0; } static inline void tcf_qevent_destroy(struct tcf_qevent *qe, struct Qdisc *sch) { } static inline int tcf_qevent_validate_change(struct tcf_qevent *qe, struct nlattr *block_index_attr, struct netlink_ext_ack *extack) { return 0; } static inline struct sk_buff * tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, struct sk_buff *skb, struct sk_buff **to_free, int *ret) { return skb; } static inline int tcf_qevent_dump(struct sk_buff *skb, int attr_name, struct tcf_qevent *qe) { return 0; } #endif struct tc_cls_u32_knode { struct tcf_exts *exts; struct tcf_result *res; struct tc_u32_sel *sel; u32 handle; u32 val; u32 mask; u32 link_handle; u8 fshift; }; struct tc_cls_u32_hnode { u32 handle; u32 prio; unsigned int divisor; }; enum tc_clsu32_command { TC_CLSU32_NEW_KNODE, TC_CLSU32_REPLACE_KNODE, TC_CLSU32_DELETE_KNODE, TC_CLSU32_NEW_HNODE, TC_CLSU32_REPLACE_HNODE, TC_CLSU32_DELETE_HNODE, }; struct tc_cls_u32_offload { struct flow_cls_common_offload common; /* knode values */ enum tc_clsu32_command command; union { struct tc_cls_u32_knode knode; struct tc_cls_u32_hnode hnode; }; }; static inline bool tc_can_offload(const struct net_device *dev) { return dev->features & NETIF_F_HW_TC; } static inline bool tc_can_offload_extack(const struct net_device *dev, struct netlink_ext_ack *extack) { bool can = tc_can_offload(dev); if (!can) NL_SET_ERR_MSG(extack, "TC offload is disabled on net device"); return can; } static inline bool tc_cls_can_offload_and_chain0(const struct net_device *dev, struct flow_cls_common_offload *common) { if (!tc_can_offload_extack(dev, common->extack)) return false; if (common->chain_index) { NL_SET_ERR_MSG(common->extack, "Driver supports only offload of chain 0"); return false; } return true; } static inline bool tc_skip_hw(u32 flags) { return (flags & TCA_CLS_FLAGS_SKIP_HW) ? true : false; } static inline bool tc_skip_sw(u32 flags) { return (flags & TCA_CLS_FLAGS_SKIP_SW) ? true : false; } /* SKIP_HW and SKIP_SW are mutually exclusive flags. 
*/ static inline bool tc_flags_valid(u32 flags) { if (flags & ~(TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW | TCA_CLS_FLAGS_VERBOSE)) return false; flags &= TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW; if (!(flags ^ (TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW))) return false; return true; } static inline bool tc_in_hw(u32 flags) { return (flags & TCA_CLS_FLAGS_IN_HW) ? true : false; } static inline void tc_cls_common_offload_init(struct flow_cls_common_offload *cls_common, const struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { cls_common->chain_index = tp->chain->index; cls_common->protocol = tp->protocol; cls_common->prio = tp->prio >> 16; cls_common->skip_sw = tc_skip_sw(flags); if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_VERBOSE) cls_common->extack = extack; } #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) static inline struct tc_skb_ext *tc_skb_ext_alloc(struct sk_buff *skb) { struct tc_skb_ext *tc_skb_ext = skb_ext_add(skb, TC_SKB_EXT); if (tc_skb_ext) memset(tc_skb_ext, 0, sizeof(*tc_skb_ext)); return tc_skb_ext; } #endif enum tc_matchall_command { TC_CLSMATCHALL_REPLACE, TC_CLSMATCHALL_DESTROY, TC_CLSMATCHALL_STATS, }; struct tc_cls_matchall_offload { struct flow_cls_common_offload common; enum tc_matchall_command command; struct flow_rule *rule; struct flow_stats stats; bool use_act_stats; unsigned long cookie; }; enum tc_clsbpf_command { TC_CLSBPF_OFFLOAD, TC_CLSBPF_STATS, }; struct tc_cls_bpf_offload { struct flow_cls_common_offload common; enum tc_clsbpf_command command; struct tcf_exts *exts; struct bpf_prog *prog; struct bpf_prog *oldprog; const char *name; bool exts_integrated; }; /* This structure holds cookie structure that is passed from user * to the kernel for actions and classifiers */ struct tc_cookie { u8 *data; u32 len; struct rcu_head rcu; }; struct tc_qopt_offload_stats { struct gnet_stats_basic_sync *bstats; struct gnet_stats_queue *qstats; }; enum tc_mq_command { TC_MQ_CREATE, TC_MQ_DESTROY, TC_MQ_STATS, TC_MQ_GRAFT, }; struct tc_mq_opt_offload_graft_params { unsigned long queue; u32 child_handle; }; struct tc_mq_qopt_offload { enum tc_mq_command command; u32 handle; union { struct tc_qopt_offload_stats stats; struct tc_mq_opt_offload_graft_params graft_params; }; }; enum tc_htb_command { /* Root */ TC_HTB_CREATE, /* Initialize HTB offload. */ TC_HTB_DESTROY, /* Destroy HTB offload. */ /* Classes */ /* Allocate qid and create leaf. */ TC_HTB_LEAF_ALLOC_QUEUE, /* Convert leaf to inner, preserve and return qid, create new leaf. */ TC_HTB_LEAF_TO_INNER, /* Delete leaf, while siblings remain. */ TC_HTB_LEAF_DEL, /* Delete leaf, convert parent to leaf, preserving qid. */ TC_HTB_LEAF_DEL_LAST, /* TC_HTB_LEAF_DEL_LAST, but delete driver data on hardware errors. */ TC_HTB_LEAF_DEL_LAST_FORCE, /* Modify parameters of a node. */ TC_HTB_NODE_MODIFY, /* Class qdisc */ TC_HTB_LEAF_QUERY_QUEUE, /* Query qid by classid. 
*/ }; struct tc_htb_qopt_offload { struct netlink_ext_ack *extack; enum tc_htb_command command; u32 parent_classid; u16 classid; u16 qid; u32 quantum; u64 rate; u64 ceil; u8 prio; }; #define TC_HTB_CLASSID_ROOT U32_MAX enum tc_red_command { TC_RED_REPLACE, TC_RED_DESTROY, TC_RED_STATS, TC_RED_XSTATS, TC_RED_GRAFT, }; struct tc_red_qopt_offload_params { u32 min; u32 max; u32 probability; u32 limit; bool is_ecn; bool is_harddrop; bool is_nodrop; struct gnet_stats_queue *qstats; }; struct tc_red_qopt_offload { enum tc_red_command command; u32 handle; u32 parent; union { struct tc_red_qopt_offload_params set; struct tc_qopt_offload_stats stats; struct red_stats *xstats; u32 child_handle; }; }; enum tc_gred_command { TC_GRED_REPLACE, TC_GRED_DESTROY, TC_GRED_STATS, }; struct tc_gred_vq_qopt_offload_params { bool present; u32 limit; u32 prio; u32 min; u32 max; bool is_ecn; bool is_harddrop; u32 probability; /* Only need backlog, see struct tc_prio_qopt_offload_params */ u32 *backlog; }; struct tc_gred_qopt_offload_params { bool grio_on; bool wred_on; unsigned int dp_cnt; unsigned int dp_def; struct gnet_stats_queue *qstats; struct tc_gred_vq_qopt_offload_params tab[MAX_DPs]; }; struct tc_gred_qopt_offload_stats { struct gnet_stats_basic_sync bstats[MAX_DPs]; struct gnet_stats_queue qstats[MAX_DPs]; struct red_stats *xstats[MAX_DPs]; }; struct tc_gred_qopt_offload { enum tc_gred_command command; u32 handle; u32 parent; union { struct tc_gred_qopt_offload_params set; struct tc_gred_qopt_offload_stats stats; }; }; enum tc_prio_command { TC_PRIO_REPLACE, TC_PRIO_DESTROY, TC_PRIO_STATS, TC_PRIO_GRAFT, }; struct tc_prio_qopt_offload_params { int bands; u8 priomap[TC_PRIO_MAX + 1]; /* At the point of un-offloading the Qdisc, the reported backlog and * qlen need to be reduced by the portion that is in HW. */ struct gnet_stats_queue *qstats; }; struct tc_prio_qopt_offload_graft_params { u8 band; u32 child_handle; }; struct tc_prio_qopt_offload { enum tc_prio_command command; u32 handle; u32 parent; union { struct tc_prio_qopt_offload_params replace_params; struct tc_qopt_offload_stats stats; struct tc_prio_qopt_offload_graft_params graft_params; }; }; enum tc_root_command { TC_ROOT_GRAFT, }; struct tc_root_qopt_offload { enum tc_root_command command; u32 handle; bool ingress; }; enum tc_ets_command { TC_ETS_REPLACE, TC_ETS_DESTROY, TC_ETS_STATS, TC_ETS_GRAFT, }; struct tc_ets_qopt_offload_replace_params { unsigned int bands; u8 priomap[TC_PRIO_MAX + 1]; unsigned int quanta[TCQ_ETS_MAX_BANDS]; /* 0 for strict bands. 
*/ unsigned int weights[TCQ_ETS_MAX_BANDS]; struct gnet_stats_queue *qstats; }; struct tc_ets_qopt_offload_graft_params { u8 band; u32 child_handle; }; struct tc_ets_qopt_offload { enum tc_ets_command command; u32 handle; u32 parent; union { struct tc_ets_qopt_offload_replace_params replace_params; struct tc_qopt_offload_stats stats; struct tc_ets_qopt_offload_graft_params graft_params; }; }; enum tc_tbf_command { TC_TBF_REPLACE, TC_TBF_DESTROY, TC_TBF_STATS, TC_TBF_GRAFT, }; struct tc_tbf_qopt_offload_replace_params { struct psched_ratecfg rate; u32 max_size; struct gnet_stats_queue *qstats; }; struct tc_tbf_qopt_offload { enum tc_tbf_command command; u32 handle; u32 parent; union { struct tc_tbf_qopt_offload_replace_params replace_params; struct tc_qopt_offload_stats stats; u32 child_handle; }; }; enum tc_fifo_command { TC_FIFO_REPLACE, TC_FIFO_DESTROY, TC_FIFO_STATS, }; struct tc_fifo_qopt_offload { enum tc_fifo_command command; u32 handle; u32 parent; union { struct tc_qopt_offload_stats stats; }; }; #ifdef CONFIG_NET_CLS_ACT DECLARE_STATIC_KEY_FALSE(tc_skb_ext_tc); void tc_skb_ext_tc_enable(void); void tc_skb_ext_tc_disable(void); #define tc_skb_ext_tc_enabled() static_branch_unlikely(&tc_skb_ext_tc) #else /* CONFIG_NET_CLS_ACT */ static inline void tc_skb_ext_tc_enable(void) { } static inline void tc_skb_ext_tc_disable(void) { } #define tc_skb_ext_tc_enabled() false #endif #endif
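/*
 * Illustrative sketch only (not part of the header above): how a
 * hypothetical driver might use tc_cls_can_offload_and_chain0() and the
 * RED qdisc offload structures when its ndo_setup_tc() callback is
 * invoked. All foo_* names are invented for the example; a real driver
 * would program its hardware where the comments indicate.
 */
#include <linux/netdevice.h>
#include <net/pkt_cls.h>
#include <net/flow_offload.h>

static int foo_setup_tc_flower(struct net_device *dev,
                               struct flow_cls_offload *f)
{
        /* Reject rules on chains other than 0 and devices without
         * NETIF_F_HW_TC; an extack message is set for the user. */
        if (!tc_cls_can_offload_and_chain0(dev, &f->common))
                return -EOPNOTSUPP;

        switch (f->command) {
        case FLOW_CLS_REPLACE:
                /* Translate f->rule and install it in hardware here. */
                return 0;
        case FLOW_CLS_DESTROY:
                /* Remove the rule identified by f->cookie here. */
                return 0;
        default:
                return -EOPNOTSUPP;
        }
}

static int foo_setup_tc_red(struct net_device *dev,
                            struct tc_red_qopt_offload *opt)
{
        switch (opt->command) {
        case TC_RED_REPLACE:
                /* opt->set carries min/max/probability and the ECN,
                 * harddrop and nodrop flags for the queue. */
                return 0;
        case TC_RED_DESTROY:
                return 0;
        case TC_RED_STATS:
                /* Fill opt->stats.bstats / opt->stats.qstats from HW. */
                return 0;
        default:
                return -EOPNOTSUPP;
        }
}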
/* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __LICENSE_H #define __LICENSE_H static inline int license_is_gpl_compatible(const char *license) { return (strcmp(license, "GPL") == 0 || strcmp(license, "GPL v2") == 0 || strcmp(license, "GPL and additional rights") == 0 || strcmp(license, "Dual BSD/GPL") == 0 || strcmp(license, "Dual MIT/GPL") == 0 || strcmp(license, "Dual MPL/GPL") == 0); } #endif
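/*
 * Illustrative sketch only (not part of the header above):
 * license_is_gpl_compatible() is a plain string comparison against the
 * handful of GPL-compatible MODULE_LICENSE() strings, so it can be
 * exercised directly. The sample strings below are just example input.
 */
#include <linux/license.h>
#include <linux/string.h>
#include <linux/printk.h>

static void license_check_example(void)
{
        /* Prints 1 for "Dual BSD/GPL", 0 for "Proprietary". */
        pr_info("Dual BSD/GPL -> %d\n",
                license_is_gpl_compatible("Dual BSD/GPL"));
        pr_info("Proprietary  -> %d\n",
                license_is_gpl_compatible("Proprietary"));
}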
// SPDX-License-Identifier: GPL-2.0-only /* Kernel thread helper functions. * Copyright (C) 2004 IBM Corporation, Rusty Russell. * Copyright (C) 2009 Red Hat, Inc.
* * Creation is done via kthreadd, so that we get a clean environment * even if we're invoked from userspace (think modprobe, hotplug cpu, * etc.). */ #include <uapi/linux/sched/types.h> #include <linux/mm.h> #include <linux/mmu_context.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/kthread.h> #include <linux/completion.h> #include <linux/err.h> #include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/unistd.h> #include <linux/file.h> #include <linux/export.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/freezer.h> #include <linux/ptrace.h> #include <linux/uaccess.h> #include <linux/numa.h> #include <linux/sched/isolation.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; struct kthread_create_info { /* Information passed to kthread() from kthreadd. */ char *full_name; int (*threadfn)(void *data); void *data; int node; /* Result passed back to kthread_create() from kthreadd. */ struct task_struct *result; struct completion *done; struct list_head list; }; struct kthread { unsigned long flags; unsigned int cpu; int result; int (*threadfn)(void *); void *data; struct completion parked; struct completion exited; #ifdef CONFIG_BLK_CGROUP struct cgroup_subsys_state *blkcg_css; #endif /* To store the full name if task comm is truncated. */ char *full_name; }; enum KTHREAD_BITS { KTHREAD_IS_PER_CPU = 0, KTHREAD_SHOULD_STOP, KTHREAD_SHOULD_PARK, }; static inline struct kthread *to_kthread(struct task_struct *k) { WARN_ON(!(k->flags & PF_KTHREAD)); return k->worker_private; } /* * Variant of to_kthread() that doesn't assume @p is a kthread. * * Per construction; when: * * (p->flags & PF_KTHREAD) && p->worker_private * * the task is both a kthread and struct kthread is persistent. However * PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and * begin_new_exec()). */ static inline struct kthread *__to_kthread(struct task_struct *p) { void *kthread = p->worker_private; if (kthread && !(p->flags & PF_KTHREAD)) kthread = NULL; return kthread; } void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk) { struct kthread *kthread = to_kthread(tsk); if (!kthread || !kthread->full_name) { strscpy(buf, tsk->comm, buf_size); return; } strscpy_pad(buf, kthread->full_name, buf_size); } bool set_kthread_struct(struct task_struct *p) { struct kthread *kthread; if (WARN_ON_ONCE(to_kthread(p))) return false; kthread = kzalloc(sizeof(*kthread), GFP_KERNEL); if (!kthread) return false; init_completion(&kthread->exited); init_completion(&kthread->parked); p->vfork_done = &kthread->exited; p->worker_private = kthread; return true; } void free_kthread_struct(struct task_struct *k) { struct kthread *kthread; /* * Can be NULL if kmalloc() in set_kthread_struct() failed. */ kthread = to_kthread(k); if (!kthread) return; #ifdef CONFIG_BLK_CGROUP WARN_ON_ONCE(kthread->blkcg_css); #endif k->worker_private = NULL; kfree(kthread->full_name); kfree(kthread); } /** * kthread_should_stop - should this kthread return now? * * When someone calls kthread_stop() on your kthread, it will be woken * and this will return true. You should then return, and your return * value will be passed through to kthread_stop(). 
*/ bool kthread_should_stop(void) { return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags); } EXPORT_SYMBOL(kthread_should_stop); static bool __kthread_should_park(struct task_struct *k) { return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags); } /** * kthread_should_park - should this kthread park now? * * When someone calls kthread_park() on your kthread, it will be woken * and this will return true. You should then do the necessary * cleanup and call kthread_parkme() * * Similar to kthread_should_stop(), but this keeps the thread alive * and in a park position. kthread_unpark() "restarts" the thread and * calls the thread function again. */ bool kthread_should_park(void) { return __kthread_should_park(current); } EXPORT_SYMBOL_GPL(kthread_should_park); bool kthread_should_stop_or_park(void) { struct kthread *kthread = __to_kthread(current); if (!kthread) return false; return kthread->flags & (BIT(KTHREAD_SHOULD_STOP) | BIT(KTHREAD_SHOULD_PARK)); } /** * kthread_freezable_should_stop - should this freezable kthread return now? * @was_frozen: optional out parameter, indicates whether %current was frozen * * kthread_should_stop() for freezable kthreads, which will enter * refrigerator if necessary. This function is safe from kthread_stop() / * freezer deadlock and freezable kthreads should use this function instead * of calling try_to_freeze() directly. */ bool kthread_freezable_should_stop(bool *was_frozen) { bool frozen = false; might_sleep(); if (unlikely(freezing(current))) frozen = __refrigerator(true); if (was_frozen) *was_frozen = frozen; return kthread_should_stop(); } EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); /** * kthread_func - return the function specified on kthread creation * @task: kthread task in question * * Returns NULL if the task is not a kthread. */ void *kthread_func(struct task_struct *task) { struct kthread *kthread = __to_kthread(task); if (kthread) return kthread->threadfn; return NULL; } EXPORT_SYMBOL_GPL(kthread_func); /** * kthread_data - return data value specified on kthread creation * @task: kthread task in question * * Return the data value specified when kthread @task was created. * The caller is responsible for ensuring the validity of @task when * calling this function. */ void *kthread_data(struct task_struct *task) { return to_kthread(task)->data; } EXPORT_SYMBOL_GPL(kthread_data); /** * kthread_probe_data - speculative version of kthread_data() * @task: possible kthread task in question * * @task could be a kthread task. Return the data value specified when it * was created if accessible. If @task isn't a kthread task or its data is * inaccessible for any reason, %NULL is returned. This function requires * that @task itself is safe to dereference. */ void *kthread_probe_data(struct task_struct *task) { struct kthread *kthread = __to_kthread(task); void *data = NULL; if (kthread) copy_from_kernel_nofault(&data, &kthread->data, sizeof(data)); return data; } static void __kthread_parkme(struct kthread *self) { for (;;) { /* * TASK_PARKED is a special state; we must serialize against * possible pending wakeups to avoid store-store collisions on * task->state. * * Such a collision might possibly result in the task state * changin from TASK_PARKED and us failing the * wait_task_inactive() in kthread_park(). 
*/ set_special_state(TASK_PARKED); if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) break; /* * Thread is going to call schedule(), do not preempt it, * or the caller of kthread_park() may spend more time in * wait_task_inactive(). */ preempt_disable(); complete(&self->parked); schedule_preempt_disabled(); preempt_enable(); } __set_current_state(TASK_RUNNING); } void kthread_parkme(void) { __kthread_parkme(to_kthread(current)); } EXPORT_SYMBOL_GPL(kthread_parkme); /** * kthread_exit - Cause the current kthread return @result to kthread_stop(). * @result: The integer value to return to kthread_stop(). * * While kthread_exit can be called directly, it exists so that * functions which do some additional work in non-modular code such as * module_put_and_kthread_exit can be implemented. * * Does not return. */ void __noreturn kthread_exit(long result) { struct kthread *kthread = to_kthread(current); kthread->result = result; do_exit(0); } EXPORT_SYMBOL(kthread_exit); /** * kthread_complete_and_exit - Exit the current kthread. * @comp: Completion to complete * @code: The integer value to return to kthread_stop(). * * If present, complete @comp and then return code to kthread_stop(). * * A kernel thread whose module may be removed after the completion of * @comp can use this function to exit safely. * * Does not return. */ void __noreturn kthread_complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); kthread_exit(code); } EXPORT_SYMBOL(kthread_complete_and_exit); static int kthread(void *_create) { static const struct sched_param param = { .sched_priority = 0 }; /* Copy data: it's on kthread's stack */ struct kthread_create_info *create = _create; int (*threadfn)(void *data) = create->threadfn; void *data = create->data; struct completion *done; struct kthread *self; int ret; self = to_kthread(current); /* Release the structure when caller killed by a fatal signal. */ done = xchg(&create->done, NULL); if (!done) { kfree(create->full_name); kfree(create); kthread_exit(-EINTR); } self->full_name = create->full_name; self->threadfn = threadfn; self->data = data; /* * The new thread inherited kthreadd's priority and CPU mask. Reset * back to default in case they have been changed. */ sched_setscheduler_nocheck(current, SCHED_NORMAL, &param); set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD)); /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; /* * Thread is going to call schedule(), do not preempt it, * or the creator may spend more time in wait_task_inactive(). */ preempt_disable(); complete(done); schedule_preempt_disabled(); preempt_enable(); ret = -EINTR; if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { cgroup_kthread_ready(); __kthread_parkme(self); ret = threadfn(data); } kthread_exit(ret); } /* called from kernel_clone() to get node information for about to be created task */ int tsk_fork_get_node(struct task_struct *tsk) { #ifdef CONFIG_NUMA if (tsk == kthreadd_task) return tsk->pref_node_fork; #endif return NUMA_NO_NODE; } static void create_kthread(struct kthread_create_info *create) { int pid; #ifdef CONFIG_NUMA current->pref_node_fork = create->node; #endif /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, create->full_name, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { /* Release the structure when caller killed by a fatal signal. 
*/ struct completion *done = xchg(&create->done, NULL); kfree(create->full_name); if (!done) { kfree(create); return; } create->result = ERR_PTR(pid); complete(done); } } static __printf(4, 0) struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], va_list args) { DECLARE_COMPLETION_ONSTACK(done); struct task_struct *task; struct kthread_create_info *create = kmalloc(sizeof(*create), GFP_KERNEL); if (!create) return ERR_PTR(-ENOMEM); create->threadfn = threadfn; create->data = data; create->node = node; create->done = &done; create->full_name = kvasprintf(GFP_KERNEL, namefmt, args); if (!create->full_name) { task = ERR_PTR(-ENOMEM); goto free_create; } spin_lock(&kthread_create_lock); list_add_tail(&create->list, &kthread_create_list); spin_unlock(&kthread_create_lock); wake_up_process(kthreadd_task); /* * Wait for completion in killable state, for I might be chosen by * the OOM killer while kthreadd is trying to allocate memory for * new kernel thread. */ if (unlikely(wait_for_completion_killable(&done))) { /* * If I was killed by a fatal signal before kthreadd (or new * kernel thread) calls complete(), leave the cleanup of this * structure to that thread. */ if (xchg(&create->done, NULL)) return ERR_PTR(-EINTR); /* * kthreadd (or new kernel thread) will call complete() * shortly. */ wait_for_completion(&done); } task = create->result; free_create: kfree(create); return task; } /** * kthread_create_on_node - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @node: task and thread structures for the thread are allocated on this node * @namefmt: printf-style name for the thread. * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and * is affine to all CPUs. * * If thread is going to be bound on a particular cpu, give its node * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either return directly if it is a * standalone thread for which no one will call kthread_stop(), or * return when 'kthread_should_stop()' is true (which means * kthread_stop() has been called). The return value should be zero * or a negative error number; it will be passed to kthread_stop(). * * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). */ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], ...) { struct task_struct *task; va_list args; va_start(args, namefmt); task = __kthread_create_on_node(threadfn, data, node, namefmt, args); va_end(args); return task; } EXPORT_SYMBOL(kthread_create_on_node); static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state) { unsigned long flags; if (!wait_task_inactive(p, state)) { WARN_ON(1); return; } /* It's safe because the task is inactive. 
*/ raw_spin_lock_irqsave(&p->pi_lock, flags); do_set_cpus_allowed(p, mask); p->flags |= PF_NO_SETAFFINITY; raw_spin_unlock_irqrestore(&p->pi_lock, flags); } static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state) { __kthread_bind_mask(p, cpumask_of(cpu), state); } void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask) { __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE); } /** * kthread_bind - bind a just-created kthread to a cpu. * @p: thread created by kthread_create(). * @cpu: cpu (might not be online, must be possible) for @k to run on. * * Description: This function is equivalent to set_cpus_allowed(), * except that @cpu doesn't need to be online, and the thread must be * stopped (i.e., just returned from kthread_create()). */ void kthread_bind(struct task_struct *p, unsigned int cpu) { __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(kthread_bind); /** * kthread_create_on_cpu - Create a cpu bound kthread * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @cpu: The cpu on which the thread should be bound, * @namefmt: printf-style name for the thread. Format is restricted * to "name.*%u". Code fills in cpu number. * * Description: This helper function creates and names a kernel thread */ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), void *data, unsigned int cpu, const char *namefmt) { struct task_struct *p; p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, cpu); if (IS_ERR(p)) return p; kthread_bind(p, cpu); /* CPU hotplug need to bind once again when unparking the thread. */ to_kthread(p)->cpu = cpu; return p; } EXPORT_SYMBOL(kthread_create_on_cpu); void kthread_set_per_cpu(struct task_struct *k, int cpu) { struct kthread *kthread = to_kthread(k); if (!kthread) return; WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY)); if (cpu < 0) { clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags); return; } kthread->cpu = cpu; set_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } bool kthread_is_per_cpu(struct task_struct *p) { struct kthread *kthread = __to_kthread(p); if (!kthread) return false; return test_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } /** * kthread_unpark - unpark a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return false, wakes it, and * waits for it to return. If the thread is marked percpu then its * bound to the cpu again. */ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_kthread(k); if (!test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)) return; /* * Newly created kthread was parked when the CPU was offline. * The binding was lost and we need to set it again. */ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) __kthread_bind(k, kthread->cpu, TASK_PARKED); clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); /* * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup. */ wake_up_state(k, TASK_PARKED); } EXPORT_SYMBOL_GPL(kthread_unpark); /** * kthread_park - park a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return true, wakes it, and * waits for it to return. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will park without * calling threadfn(). * * Returns 0 if the thread is parked, -ENOSYS if the thread exited. * If called by the kthread itself just the park bit is set. 
*/ int kthread_park(struct task_struct *k) { struct kthread *kthread = to_kthread(k); if (WARN_ON(k->flags & PF_EXITING)) return -ENOSYS; if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))) return -EBUSY; set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); if (k != current) { wake_up_process(k); /* * Wait for __kthread_parkme() to complete(), this means we * _will_ have TASK_PARKED and are about to call schedule(). */ wait_for_completion(&kthread->parked); /* * Now wait for that schedule() to complete and the task to * get scheduled out. */ WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED)); } return 0; } EXPORT_SYMBOL_GPL(kthread_park); /** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_stop() for @k to return true, wakes it, and * waits for it to exit. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will exit without * calling threadfn(). * * If threadfn() may call kthread_exit() itself, the caller must ensure * task_struct can't go away. * * Returns the result of threadfn(), or %-EINTR if wake_up_process() * was never called. */ int kthread_stop(struct task_struct *k) { struct kthread *kthread; int ret; trace_sched_kthread_stop(k); get_task_struct(k); kthread = to_kthread(k); set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); kthread_unpark(k); set_tsk_thread_flag(k, TIF_NOTIFY_SIGNAL); wake_up_process(k); wait_for_completion(&kthread->exited); ret = kthread->result; put_task_struct(k); trace_sched_kthread_stop_ret(ret); return ret; } EXPORT_SYMBOL(kthread_stop); /** * kthread_stop_put - stop a thread and put its task struct * @k: thread created by kthread_create(). * * Stops a thread created by kthread_create() and put its task_struct. * Only use when holding an extra task struct reference obtained by * calling get_task_struct(). */ int kthread_stop_put(struct task_struct *k) { int ret; ret = kthread_stop(k); put_task_struct(k); return ret; } EXPORT_SYMBOL(kthread_stop_put); int kthreadd(void *unused) { struct task_struct *tsk = current; /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD)); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; cgroup_init_kthreadd(); for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (list_empty(&kthread_create_list)) schedule(); __set_current_state(TASK_RUNNING); spin_lock(&kthread_create_lock); while (!list_empty(&kthread_create_list)) { struct kthread_create_info *create; create = list_entry(kthread_create_list.next, struct kthread_create_info, list); list_del_init(&create->list); spin_unlock(&kthread_create_lock); create_kthread(create); spin_lock(&kthread_create_lock); } spin_unlock(&kthread_create_lock); } return 0; } void __kthread_init_worker(struct kthread_worker *worker, const char *name, struct lock_class_key *key) { memset(worker, 0, sizeof(struct kthread_worker)); raw_spin_lock_init(&worker->lock); lockdep_set_class_and_name(&worker->lock, key, name); INIT_LIST_HEAD(&worker->work_list); INIT_LIST_HEAD(&worker->delayed_work_list); } EXPORT_SYMBOL_GPL(__kthread_init_worker); /** * kthread_worker_fn - kthread function to process kthread_worker * @worker_ptr: pointer to initialized kthread_worker * * This function implements the main cycle of kthread worker. It processes * work_list until it is stopped with kthread_stop(). It sleeps when the queue * is empty. 
* * The works are not allowed to keep any locks, disable preemption or interrupts * when they finish. There is defined a safe point for freezing when one work * finishes and before a new one is started. * * Also the works must not be handled by more than one worker at the same time, * see also kthread_queue_work(). */ int kthread_worker_fn(void *worker_ptr) { struct kthread_worker *worker = worker_ptr; struct kthread_work *work; /* * FIXME: Update the check and remove the assignment when all kthread * worker users are created using kthread_create_worker*() functions. */ WARN_ON(worker->task && worker->task != current); worker->task = current; if (worker->flags & KTW_FREEZABLE) set_freezable(); repeat: set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); raw_spin_lock_irq(&worker->lock); worker->task = NULL; raw_spin_unlock_irq(&worker->lock); return 0; } work = NULL; raw_spin_lock_irq(&worker->lock); if (!list_empty(&worker->work_list)) { work = list_first_entry(&worker->work_list, struct kthread_work, node); list_del_init(&work->node); } worker->current_work = work; raw_spin_unlock_irq(&worker->lock); if (work) { kthread_work_func_t func = work->func; __set_current_state(TASK_RUNNING); trace_sched_kthread_work_execute_start(work); work->func(work); /* * Avoid dereferencing work after this point. The trace * event only cares about the address. */ trace_sched_kthread_work_execute_end(work, func); } else if (!freezing(current)) { schedule(); } else { /* * Handle the case where the current remains * TASK_INTERRUPTIBLE. try_to_freeze() expects * the current to be TASK_RUNNING. */ __set_current_state(TASK_RUNNING); } try_to_freeze(); cond_resched(); goto repeat; } EXPORT_SYMBOL_GPL(kthread_worker_fn); static __printf(3, 0) struct kthread_worker * __kthread_create_worker(int cpu, unsigned int flags, const char namefmt[], va_list args) { struct kthread_worker *worker; struct task_struct *task; int node = NUMA_NO_NODE; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) return ERR_PTR(-ENOMEM); kthread_init_worker(worker); if (cpu >= 0) node = cpu_to_node(cpu); task = __kthread_create_on_node(kthread_worker_fn, worker, node, namefmt, args); if (IS_ERR(task)) goto fail_task; if (cpu >= 0) kthread_bind(task, cpu); worker->flags = flags; worker->task = task; wake_up_process(task); return worker; fail_task: kfree(worker); return ERR_CAST(task); } /** * kthread_create_worker - create a kthread worker * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the kthread worker (task). * * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker(unsigned int flags, const char namefmt[], ...) { struct kthread_worker *worker; va_list args; va_start(args, namefmt); worker = __kthread_create_worker(-1, flags, namefmt, args); va_end(args); return worker; } EXPORT_SYMBOL(kthread_create_worker); /** * kthread_create_worker_on_cpu - create a kthread worker and bind it * to a given CPU and the associated NUMA node. * @cpu: CPU number * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the kthread worker (task). * * Use a valid CPU number if you want to bind the kthread worker * to the given CPU and the associated NUMA node. 
* * A good practice is to add the cpu number also into the worker name. * For example, use kthread_create_worker_on_cpu(cpu, "helper/%d", cpu). * * CPU hotplug: * The kthread worker API is simple and generic. It just provides a way * to create, use, and destroy workers. * * It is up to the API user how to handle CPU hotplug. They have to decide * how to handle pending work items, prevent queuing new ones, and * restore the functionality when the CPU goes off and on. There are a * few catches: * * - CPU affinity gets lost when it is scheduled on an offline CPU. * * - The worker might not exist when the CPU was off when the user * created the workers. * * Good practice is to implement two CPU hotplug callbacks and to * destroy/create the worker when the CPU goes down/up. * * Return: * The pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[], ...) { struct kthread_worker *worker; va_list args; va_start(args, namefmt); worker = __kthread_create_worker(cpu, flags, namefmt, args); va_end(args); return worker; } EXPORT_SYMBOL(kthread_create_worker_on_cpu); /* * Returns true when the work could not be queued at the moment. * It happens when it is already pending in a worker list * or when it is being cancelled. */ static inline bool queuing_blocked(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); return !list_empty(&work->node) || work->canceling; } static void kthread_insert_work_sanity_check(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); WARN_ON_ONCE(!list_empty(&work->node)); /* Do not use a work with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker && work->worker != worker); } /* insert @work before @pos in @worker */ static void kthread_insert_work(struct kthread_worker *worker, struct kthread_work *work, struct list_head *pos) { kthread_insert_work_sanity_check(worker, work); trace_sched_kthread_work_queue_work(worker, work); list_add_tail(&work->node, pos); work->worker = worker; if (!worker->current_work && likely(worker->task)) wake_up_process(worker->task); } /** * kthread_queue_work - queue a kthread_work * @worker: target kthread_worker * @work: kthread_work to queue * * Queue @work to work processor @task for async execution. @task * must have been created with kthread_worker_create(). Returns %true * if @work was successfully queued, %false if it was already pending. * * Reinitialize the work if it needs to be used by another worker. * For example, when the worker was stopped and started again. */ bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work) { bool ret = false; unsigned long flags; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { kthread_insert_work(worker, work, &worker->work_list); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_work); /** * kthread_delayed_work_timer_fn - callback that queues the associated kthread * delayed work when the timer expires. * @t: pointer to the expired timer * * The format of the function is defined by struct timer_list. * It should have been called from irqsafe timer with irq already off. 
*/ void kthread_delayed_work_timer_fn(struct timer_list *t) { struct kthread_delayed_work *dwork = from_timer(dwork, t, timer); struct kthread_work *work = &dwork->work; struct kthread_worker *worker = work->worker; unsigned long flags; /* * This might happen when a pending work is reinitialized. * It means that it is used a wrong way. */ if (WARN_ON_ONCE(!worker)) return; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); /* Move the work from worker->delayed_work_list. */ WARN_ON_ONCE(list_empty(&work->node)); list_del_init(&work->node); if (!work->canceling) kthread_insert_work(worker, work, &worker->work_list); raw_spin_unlock_irqrestore(&worker->lock, flags); } EXPORT_SYMBOL(kthread_delayed_work_timer_fn); static void __kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn); /* * If @delay is 0, queue @dwork->work immediately. This is for * both optimization and correctness. The earliest @timer can * expire is on the closest next tick and delayed_work users depend * on that there's no such delay when @delay is 0. */ if (!delay) { kthread_insert_work(worker, work, &worker->work_list); return; } /* Be paranoid and try to detect possible races already now. */ kthread_insert_work_sanity_check(worker, work); list_add(&work->node, &worker->delayed_work_list); work->worker = worker; timer->expires = jiffies + delay; add_timer(timer); } /** * kthread_queue_delayed_work - queue the associated kthread work * after a delay. * @worker: target kthread_worker * @dwork: kthread_delayed_work to queue * @delay: number of jiffies to wait before queuing * * If the work has not been pending it starts a timer that will queue * the work after the given @delay. If @delay is zero, it queues the * work immediately. * * Return: %false if the @work has already been pending. It means that * either the timer was running or the work was queued. It returns %true * otherwise. */ bool kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; bool ret = false; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { __kthread_queue_delayed_work(worker, dwork, delay); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_delayed_work); struct kthread_flush_work { struct kthread_work work; struct completion done; }; static void kthread_flush_work_fn(struct kthread_work *work) { struct kthread_flush_work *fwork = container_of(work, struct kthread_flush_work, work); complete(&fwork->done); } /** * kthread_flush_work - flush a kthread_work * @work: work to flush * * If @work is queued or executing, wait for it to finish execution. */ void kthread_flush_work(struct kthread_work *work) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; struct kthread_worker *worker; bool noop = false; worker = work->worker; if (!worker) return; raw_spin_lock_irq(&worker->lock); /* Work must not be used with >1 worker, see kthread_queue_work(). 
*/ WARN_ON_ONCE(work->worker != worker); if (!list_empty(&work->node)) kthread_insert_work(worker, &fwork.work, work->node.next); else if (worker->current_work == work) kthread_insert_work(worker, &fwork.work, worker->work_list.next); else noop = true; raw_spin_unlock_irq(&worker->lock); if (!noop) wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_work); /* * Make sure that the timer is neither set nor running and could * not manipulate the work list_head any longer. * * The function is called under worker->lock. The lock is temporary * released but the timer can't be set again in the meantime. */ static void kthread_cancel_delayed_work_timer(struct kthread_work *work, unsigned long *flags) { struct kthread_delayed_work *dwork = container_of(work, struct kthread_delayed_work, work); struct kthread_worker *worker = work->worker; /* * del_timer_sync() must be called to make sure that the timer * callback is not running. The lock must be temporary released * to avoid a deadlock with the callback. In the meantime, * any queuing is blocked by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, *flags); del_timer_sync(&dwork->timer); raw_spin_lock_irqsave(&worker->lock, *flags); work->canceling--; } /* * This function removes the work from the worker queue. * * It is called under worker->lock. The caller must make sure that * the timer used by delayed work is not running, e.g. by calling * kthread_cancel_delayed_work_timer(). * * The work might still be in use when this function finishes. See the * current_work proceed by the worker. * * Return: %true if @work was pending and successfully canceled, * %false if @work was not pending */ static bool __kthread_cancel_work(struct kthread_work *work) { /* * Try to remove the work from a worker list. It might either * be from worker->work_list or from worker->delayed_work_list. */ if (!list_empty(&work->node)) { list_del_init(&work->node); return true; } return false; } /** * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work * @worker: kthread worker to use * @dwork: kthread delayed work to queue * @delay: number of jiffies to wait before queuing * * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise, * modify @dwork's timer so that it expires after @delay. If @delay is zero, * @work is guaranteed to be queued immediately. * * Return: %false if @dwork was idle and queued, %true otherwise. * * A special case is when the work is being canceled in parallel. * It might be caused either by the real kthread_cancel_delayed_work_sync() * or yet another kthread_mod_delayed_work() call. We let the other command * win and return %true here. The return value can be used for reference * counting and the number of queued works stays the same. Anyway, the caller * is supposed to synchronize these operations a reasonable way. * * This function is safe to call from any context including IRQ handler. * See __kthread_cancel_work() and kthread_delayed_work_timer_fn() * for details. */ bool kthread_mod_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; int ret; raw_spin_lock_irqsave(&worker->lock, flags); /* Do not bother with canceling when never queued. 
*/ if (!work->worker) { ret = false; goto fast_queue; } /* Work must not be used with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker != worker); /* * Temporary cancel the work but do not fight with another command * that is canceling the work as well. * * It is a bit tricky because of possible races with another * mod_delayed_work() and cancel_delayed_work() callers. * * The timer must be canceled first because worker->lock is released * when doing so. But the work can be removed from the queue (list) * only when it can be queued again so that the return value can * be used for reference counting. */ kthread_cancel_delayed_work_timer(work, &flags); if (work->canceling) { /* The number of works in the queue does not change. */ ret = true; goto out; } ret = __kthread_cancel_work(work); fast_queue: __kthread_queue_delayed_work(worker, dwork, delay); out: raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) { struct kthread_worker *worker = work->worker; unsigned long flags; int ret = false; if (!worker) goto out; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); if (is_dwork) kthread_cancel_delayed_work_timer(work, &flags); ret = __kthread_cancel_work(work); if (worker->current_work != work) goto out_fast; /* * The work is in progress and we need to wait with the lock released. * In the meantime, block any queuing by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, flags); kthread_flush_work(work); raw_spin_lock_irqsave(&worker->lock, flags); work->canceling--; out_fast: raw_spin_unlock_irqrestore(&worker->lock, flags); out: return ret; } /** * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish * @work: the kthread work to cancel * * Cancel @work and wait for its execution to finish. This function * can be used even if the work re-queues itself. On return from this * function, @work is guaranteed to be not pending or executing on any CPU. * * kthread_cancel_work_sync(&delayed_work->work) must not be used for * delayed_work's. Use kthread_cancel_delayed_work_sync() instead. * * The caller must ensure that the worker on which @work was last * queued can't be destroyed before this function returns. * * Return: %true if @work was pending, %false otherwise. */ bool kthread_cancel_work_sync(struct kthread_work *work) { return __kthread_cancel_work_sync(work, false); } EXPORT_SYMBOL_GPL(kthread_cancel_work_sync); /** * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and * wait for it to finish. * @dwork: the kthread delayed work to cancel * * This is kthread_cancel_work_sync() for delayed works. * * Return: %true if @dwork was pending, %false otherwise. */ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork) { return __kthread_cancel_work_sync(&dwork->work, true); } EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync); /** * kthread_flush_worker - flush all current works on a kthread_worker * @worker: worker to flush * * Wait until all currently executing or pending works on @worker are * finished. 
*/ void kthread_flush_worker(struct kthread_worker *worker) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; kthread_queue_work(worker, &fwork.work); wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_worker); /** * kthread_destroy_worker - destroy a kthread worker * @worker: worker to be destroyed * * Flush and destroy @worker. The simple flush is enough because the kthread * worker API is used only in trivial scenarios. There are no multi-step state * machines needed. * * Note that this function is not responsible for handling delayed work, so * caller should be responsible for queuing or canceling all delayed work items * before invoke this function. */ void kthread_destroy_worker(struct kthread_worker *worker) { struct task_struct *task; task = worker->task; if (WARN_ON(!task)) return; kthread_flush_worker(worker); kthread_stop(task); WARN_ON(!list_empty(&worker->delayed_work_list)); WARN_ON(!list_empty(&worker->work_list)); kfree(worker); } EXPORT_SYMBOL(kthread_destroy_worker); /** * kthread_use_mm - make the calling kthread operate on an address space * @mm: address space to operate on */ void kthread_use_mm(struct mm_struct *mm) { struct mm_struct *active_mm; struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(tsk->mm); /* * It is possible for mm to be the same as tsk->active_mm, but * we must still mmgrab(mm) and mmdrop_lazy_tlb(active_mm), * because these references are not equivalent. */ mmgrab(mm); task_lock(tsk); /* Hold off tlb flush IPIs while switching mm's */ local_irq_disable(); active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; membarrier_update_current_mm(mm); switch_mm_irqs_off(active_mm, mm, tsk); local_irq_enable(); task_unlock(tsk); #ifdef finish_arch_post_lock_switch finish_arch_post_lock_switch(); #endif /* * When a kthread starts operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm, and not issue an IPI. Membarrier requires a * memory barrier after storing to tsk->mm, before accessing * user-space memory. A full memory barrier for membarrier * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by * mmdrop_lazy_tlb(). */ mmdrop_lazy_tlb(active_mm); } EXPORT_SYMBOL_GPL(kthread_use_mm); /** * kthread_unuse_mm - reverse the effect of kthread_use_mm() * @mm: address space to operate on */ void kthread_unuse_mm(struct mm_struct *mm) { struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(!tsk->mm); task_lock(tsk); /* * When a kthread stops operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm, and not issue an IPI. Membarrier requires a * memory barrier after accessing user-space memory, before * clearing tsk->mm. */ smp_mb__after_spinlock(); local_irq_disable(); tsk->mm = NULL; membarrier_update_current_mm(NULL); mmgrab_lazy_tlb(mm); /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); local_irq_enable(); task_unlock(tsk); mmdrop(mm); } EXPORT_SYMBOL_GPL(kthread_unuse_mm); #ifdef CONFIG_BLK_CGROUP /** * kthread_associate_blkcg - associate blkcg to current kthread * @css: the cgroup info * * Current thread must be a kthread. The thread is running jobs on behalf of * other threads. In some cases, we expect the jobs attach cgroup info of * original threads instead of that of current thread. 
This function stores * original thread's cgroup info in current kthread context for later * retrieval. */ void kthread_associate_blkcg(struct cgroup_subsys_state *css) { struct kthread *kthread; if (!(current->flags & PF_KTHREAD)) return; kthread = to_kthread(current); if (!kthread) return; if (kthread->blkcg_css) { css_put(kthread->blkcg_css); kthread->blkcg_css = NULL; } if (css) { css_get(css); kthread->blkcg_css = css; } } EXPORT_SYMBOL(kthread_associate_blkcg); /** * kthread_blkcg - get associated blkcg css of current kthread * * Current thread must be a kthread. */ struct cgroup_subsys_state *kthread_blkcg(void) { struct kthread *kthread; if (current->flags & PF_KTHREAD) { kthread = to_kthread(current); if (kthread) return kthread->blkcg_css; } return NULL; } #endif
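/*
 * Illustrative sketch only (not part of kthread.c): the canonical
 * lifecycle of a thread built on the primitives above. foo_thread_fn()
 * loops until kthread_stop(), parking on request; foo_demo() creates,
 * parks, unparks and finally stops it. All foo_* names are invented.
 */
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/printk.h>

static int foo_thread_fn(void *data)
{
        while (!kthread_should_stop()) {
                if (kthread_should_park()) {
                        /* Sleep in TASK_PARKED until kthread_unpark(). */
                        kthread_parkme();
                        continue;
                }
                /* ... one unit of real work would go here ... */
                msleep(100);
        }
        /* This value is returned to the kthread_stop() caller. */
        return 0;
}

static int foo_demo(void)
{
        struct task_struct *tsk;

        /* kthread_run() = kthread_create() + wake_up_process(). */
        tsk = kthread_run(foo_thread_fn, NULL, "foo/%d", 0);
        if (IS_ERR(tsk))
                return PTR_ERR(tsk);

        if (kthread_park(tsk) == 0)     /* waits until the thread parks */
                kthread_unpark(tsk);

        return kthread_stop(tsk);       /* returns foo_thread_fn()'s value */
}

/*
 * Illustrative sketch only: queueing work items to a dedicated
 * kthread_worker, including a delayed item, then tearing the worker
 * down. foo_work_fn() and the "foo_worker" name are again assumptions.
 */
#include <linux/jiffies.h>

static void foo_work_fn(struct kthread_work *work)
{
        /* Runs in the worker thread's context, one item at a time. */
        pr_info("kthread_work executed\n");
}

static int foo_worker_demo(void)
{
        struct kthread_worker *worker;
        struct kthread_work work;
        struct kthread_delayed_work dwork;

        worker = kthread_create_worker(0, "foo_worker");
        if (IS_ERR(worker))
                return PTR_ERR(worker);

        kthread_init_work(&work, foo_work_fn);
        kthread_init_delayed_work(&dwork, foo_work_fn);

        kthread_queue_work(worker, &work);
        kthread_queue_delayed_work(worker, &dwork, msecs_to_jiffies(50));

        /* Wait for everything queued so far, then cancel and destroy. */
        kthread_flush_worker(worker);
        kthread_cancel_delayed_work_sync(&dwork);
        kthread_destroy_worker(worker);
        return 0;
}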
// SPDX-License-Identifier: GPL-2.0 /* * fs/sysfs/symlink.c - sysfs symlink implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * * Please see Documentation/filesystems/sysfs.rst for more information. */ #include <linux/fs.h> #include <linux/module.h> #include <linux/kobject.h> #include <linux/mutex.h> #include <linux/security.h> #include "sysfs.h" static int sysfs_do_create_link_sd(struct kernfs_node *parent, struct kobject *target_kobj, const char *name, int warn) { struct kernfs_node *kn, *target = NULL; if (WARN_ON(!name || !parent)) return -EINVAL; /* * We don't own @target_kobj and it may be removed at any time. * Synchronize using sysfs_symlink_target_lock. See * sysfs_remove_dir() for details. */ spin_lock(&sysfs_symlink_target_lock); if (target_kobj->sd) { target = target_kobj->sd; kernfs_get(target); } spin_unlock(&sysfs_symlink_target_lock); if (!target) return -ENOENT; kn = kernfs_create_link(parent, name, target); kernfs_put(target); if (!IS_ERR(kn)) return 0; if (warn && PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, name); return PTR_ERR(kn); } /** * sysfs_create_link_sd - create symlink to a given object. * @kn: directory we're creating the link in. * @target: object we're pointing to. * @name: name of the symlink. */ int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target, const char *name) { return sysfs_do_create_link_sd(kn, target, name, 1); } static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, const char *name, int warn) { struct kernfs_node *parent = NULL; if (!kobj) parent = sysfs_root_kn; else parent = kobj->sd; if (!parent) return -EFAULT; return sysfs_do_create_link_sd(parent, target, name, warn); } /** * sysfs_create_link - create symlink between two objects. * @kobj: object whose directory we're creating the link in. * @target: object we're pointing to. * @name: name of the symlink. */ int sysfs_create_link(struct kobject *kobj, struct kobject *target, const char *name) { return sysfs_do_create_link(kobj, target, name, 1); } EXPORT_SYMBOL_GPL(sysfs_create_link); /** * sysfs_create_link_nowarn - create symlink between two objects. * @kobj: object whose directory we're creating the link in. * @target: object we're pointing to. * @name: name of the symlink. * * This function does the same as sysfs_create_link(), but it * doesn't warn if the link already exists. */ int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, const char *name) { return sysfs_do_create_link(kobj, target, name, 0); } EXPORT_SYMBOL_GPL(sysfs_create_link_nowarn); /** * sysfs_delete_link - remove symlink in object's directory. * @kobj: object we're acting for. * @targ: object we're pointing to.
* @name: name of the symlink to remove. * * Unlike sysfs_remove_link sysfs_delete_link has enough information * to successfully delete symlinks in tagged directories. */ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, const char *name) { const void *ns = NULL; /* * We don't own @target and it may be removed at any time. * Synchronize using sysfs_symlink_target_lock. See * sysfs_remove_dir() for details. */ spin_lock(&sysfs_symlink_target_lock); if (targ->sd && kernfs_ns_enabled(kobj->sd)) ns = targ->sd->ns; spin_unlock(&sysfs_symlink_target_lock); kernfs_remove_by_name_ns(kobj->sd, name, ns); } /** * sysfs_remove_link - remove symlink in object's directory. * @kobj: object we're acting for. * @name: name of the symlink to remove. */ void sysfs_remove_link(struct kobject *kobj, const char *name) { struct kernfs_node *parent = NULL; if (!kobj) parent = sysfs_root_kn; else parent = kobj->sd; kernfs_remove_by_name(parent, name); } EXPORT_SYMBOL_GPL(sysfs_remove_link); /** * sysfs_rename_link_ns - rename symlink in object's directory. * @kobj: object we're acting for. * @targ: object we're pointing to. * @old: previous name of the symlink. * @new: new name of the symlink. * @new_ns: new namespace of the symlink. * * A helper function for the common rename symlink idiom. */ int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ, const char *old, const char *new, const void *new_ns) { struct kernfs_node *parent, *kn = NULL; const void *old_ns = NULL; int result; if (!kobj) parent = sysfs_root_kn; else parent = kobj->sd; if (targ->sd) old_ns = targ->sd->ns; result = -ENOENT; kn = kernfs_find_and_get_ns(parent, old, old_ns); if (!kn) goto out; result = -EINVAL; if (kernfs_type(kn) != KERNFS_LINK) goto out; if (kn->symlink.target_kn->priv != targ) goto out; result = kernfs_rename_ns(kn, parent, new, new_ns); out: kernfs_put(kn); return result; } EXPORT_SYMBOL_GPL(sysfs_rename_link_ns);
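/*
 * Illustrative sketch only (not part of symlink.c): creating and
 * removing a symlink in a kobject's sysfs directory with the helpers
 * above. The kobjects and the "peer" link name are assumptions; both
 * kobjects must already be registered in sysfs.
 */
#include <linux/kobject.h>
#include <linux/sysfs.h>

static int foo_link_demo(struct kobject *kobj, struct kobject *target)
{
        int err;

        /* Creates <kobj's dir>/peer -> <target's dir>; warns on duplicates. */
        err = sysfs_create_link(kobj, target, "peer");
        if (err)
                return err;

        /* ... the link is visible to user space here ... */

        sysfs_remove_link(kobj, "peer");
        return 0;
}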
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/key/af_key.c	An implementation of PF_KEYv2 sockets.
 *
 * Authors:	Maxim Giryaev	<gem@asplinux.ru>
 *		David S. Miller	<davem@redhat.com>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Kunihiro Ishiguro <kunihiro@ipinfusion.com>
 *		Kazunori MIYAZAWA / USAGI Project <miyazawa@linux-ipv6.org>
 *		Derek Atkins <derek@ihtfp.com>
 */

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/socket.h>
#include <linux/pfkeyv2.h>
#include <linux/ipsec.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/xfrm.h>

#include <net/sock.h>

#define _X2KEY(x) ((x) == XFRM_INF ? 0 : (x))
#define _KEY2X(x) ((x) == 0 ? XFRM_INF : (x))

static unsigned int pfkey_net_id __read_mostly;
struct netns_pfkey {
	/* List of all pfkey sockets.
	 */
	struct hlist_head table;
	atomic_t socks_nr;
};

static DEFINE_MUTEX(pfkey_mutex);

#define DUMMY_MARK 0
static const struct xfrm_mark dummy_mark = {0, 0};
struct pfkey_sock {
	/* struct sock must be the first member of struct pfkey_sock */
	struct sock	sk;
	int		registered;
	int		promisc;

	struct {
		uint8_t		msg_version;
		uint32_t	msg_portid;
		int		(*dump)(struct pfkey_sock *sk);
		void		(*done)(struct pfkey_sock *sk);
		union {
			struct xfrm_policy_walk	policy;
			struct xfrm_state_walk	state;
		} u;
		struct sk_buff	*skb;
	} dump;
	struct mutex dump_lock;
};

static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len,
			       xfrm_address_t *saddr, xfrm_address_t *daddr,
			       u16 *family);

static inline struct pfkey_sock *pfkey_sk(struct sock *sk)
{
	return (struct pfkey_sock *)sk;
}

static int pfkey_can_dump(const struct sock *sk)
{
	if (3 * atomic_read(&sk->sk_rmem_alloc) <= 2 * sk->sk_rcvbuf)
		return 1;
	return 0;
}

static void pfkey_terminate_dump(struct pfkey_sock *pfk)
{
	if (pfk->dump.dump) {
		if (pfk->dump.skb) {
			kfree_skb(pfk->dump.skb);
			pfk->dump.skb = NULL;
		}
		pfk->dump.done(pfk);
		pfk->dump.dump = NULL;
		pfk->dump.done = NULL;
	}
}

static void pfkey_sock_destruct(struct sock *sk)
{
	struct net *net = sock_net(sk);
	struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);

	pfkey_terminate_dump(pfkey_sk(sk));
	skb_queue_purge(&sk->sk_receive_queue);

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive pfkey socket: %p\n", sk);
		return;
	}

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(refcount_read(&sk->sk_wmem_alloc));

	atomic_dec(&net_pfkey->socks_nr);
}

static const struct proto_ops pfkey_ops;

static void pfkey_insert(struct sock *sk)
{
	struct net *net = sock_net(sk);
	struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);

	mutex_lock(&pfkey_mutex);
	sk_add_node_rcu(sk, &net_pfkey->table);
	mutex_unlock(&pfkey_mutex);
}

static void pfkey_remove(struct sock *sk)
{
	mutex_lock(&pfkey_mutex);
	sk_del_node_init_rcu(sk);
	mutex_unlock(&pfkey_mutex);
}

static struct proto key_proto = {
	.name	  = "KEY",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct pfkey_sock),
};

static int pfkey_create(struct net *net, struct socket *sock, int protocol,
			int kern)
{
	struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);
	struct sock *sk;
	struct pfkey_sock *pfk;

	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
		return -EPERM;
	if (sock->type != SOCK_RAW)
		return -ESOCKTNOSUPPORT;
	if (protocol != PF_KEY_V2)
		return -EPROTONOSUPPORT;

	sk = sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto, kern);
	if (sk == NULL)
		return -ENOMEM;

	pfk = pfkey_sk(sk);
	mutex_init(&pfk->dump_lock);

	sock->ops = &pfkey_ops;
	sock_init_data(sock, sk);

	sk->sk_family = PF_KEY;
	sk->sk_destruct = pfkey_sock_destruct;

	atomic_inc(&net_pfkey->socks_nr);

	pfkey_insert(sk);

	return 0;
}

static int pfkey_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	pfkey_remove(sk);

	sock_orphan(sk);
	sock->sk = NULL;
	skb_queue_purge(&sk->sk_write_queue);

	synchronize_rcu();
	sock_put(sk);

	return 0;
}

static int pfkey_broadcast_one(struct sk_buff *skb, gfp_t allocation,
			       struct sock *sk)
{
	int err = -ENOBUFS;

	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
		return err;

	skb = skb_clone(skb, allocation);

	if (skb) {
		skb_set_owner_r(skb, sk);
		skb_queue_tail(&sk->sk_receive_queue, skb);
		sk->sk_data_ready(sk);
		err = 0;
	}
	return err;
}

/* Send SKB to all pfkey sockets matching selected criteria.
*/ #define BROADCAST_ALL 0 #define BROADCAST_ONE 1 #define BROADCAST_REGISTERED 2 #define BROADCAST_PROMISC_ONLY 4 static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, int broadcast_flags, struct sock *one_sk, struct net *net) { struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); struct sock *sk; int err = -ESRCH; /* XXX Do we need something like netlink_overrun? I think * XXX PF_KEY socket apps will not mind current behavior. */ if (!skb) return -ENOMEM; rcu_read_lock(); sk_for_each_rcu(sk, &net_pfkey->table) { struct pfkey_sock *pfk = pfkey_sk(sk); int err2; /* Yes, it means that if you are meant to receive this * pfkey message you receive it twice as promiscuous * socket. */ if (pfk->promisc) pfkey_broadcast_one(skb, GFP_ATOMIC, sk); /* the exact target will be processed later */ if (sk == one_sk) continue; if (broadcast_flags != BROADCAST_ALL) { if (broadcast_flags & BROADCAST_PROMISC_ONLY) continue; if ((broadcast_flags & BROADCAST_REGISTERED) && !pfk->registered) continue; if (broadcast_flags & BROADCAST_ONE) continue; } err2 = pfkey_broadcast_one(skb, GFP_ATOMIC, sk); /* Error is cleared after successful sending to at least one * registered KM */ if ((broadcast_flags & BROADCAST_REGISTERED) && err) err = err2; } rcu_read_unlock(); if (one_sk != NULL) err = pfkey_broadcast_one(skb, allocation, one_sk); kfree_skb(skb); return err; } static int pfkey_do_dump(struct pfkey_sock *pfk) { struct sadb_msg *hdr; int rc; mutex_lock(&pfk->dump_lock); if (!pfk->dump.dump) { rc = 0; goto out; } rc = pfk->dump.dump(pfk); if (rc == -ENOBUFS) { rc = 0; goto out; } if (pfk->dump.skb) { if (!pfkey_can_dump(&pfk->sk)) { rc = 0; goto out; } hdr = (struct sadb_msg *) pfk->dump.skb->data; hdr->sadb_msg_seq = 0; hdr->sadb_msg_errno = rc; pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = NULL; } pfkey_terminate_dump(pfk); out: mutex_unlock(&pfk->dump_lock); return rc; } static inline void pfkey_hdr_dup(struct sadb_msg *new, const struct sadb_msg *orig) { *new = *orig; } static int pfkey_error(const struct sadb_msg *orig, int err, struct sock *sk) { struct sk_buff *skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_KERNEL); struct sadb_msg *hdr; if (!skb) return -ENOBUFS; /* Woe be to the platform trying to support PFKEY yet * having normal errnos outside the 1-255 range, inclusive. 
*/ err = -err; if (err == ERESTARTSYS || err == ERESTARTNOHAND || err == ERESTARTNOINTR) err = EINTR; if (err >= 512) err = EINVAL; BUG_ON(err <= 0 || err >= 256); hdr = skb_put(skb, sizeof(struct sadb_msg)); pfkey_hdr_dup(hdr, orig); hdr->sadb_msg_errno = (uint8_t) err; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ONE, sk, sock_net(sk)); return 0; } static const u8 sadb_ext_min_len[] = { [SADB_EXT_RESERVED] = (u8) 0, [SADB_EXT_SA] = (u8) sizeof(struct sadb_sa), [SADB_EXT_LIFETIME_CURRENT] = (u8) sizeof(struct sadb_lifetime), [SADB_EXT_LIFETIME_HARD] = (u8) sizeof(struct sadb_lifetime), [SADB_EXT_LIFETIME_SOFT] = (u8) sizeof(struct sadb_lifetime), [SADB_EXT_ADDRESS_SRC] = (u8) sizeof(struct sadb_address), [SADB_EXT_ADDRESS_DST] = (u8) sizeof(struct sadb_address), [SADB_EXT_ADDRESS_PROXY] = (u8) sizeof(struct sadb_address), [SADB_EXT_KEY_AUTH] = (u8) sizeof(struct sadb_key), [SADB_EXT_KEY_ENCRYPT] = (u8) sizeof(struct sadb_key), [SADB_EXT_IDENTITY_SRC] = (u8) sizeof(struct sadb_ident), [SADB_EXT_IDENTITY_DST] = (u8) sizeof(struct sadb_ident), [SADB_EXT_SENSITIVITY] = (u8) sizeof(struct sadb_sens), [SADB_EXT_PROPOSAL] = (u8) sizeof(struct sadb_prop), [SADB_EXT_SUPPORTED_AUTH] = (u8) sizeof(struct sadb_supported), [SADB_EXT_SUPPORTED_ENCRYPT] = (u8) sizeof(struct sadb_supported), [SADB_EXT_SPIRANGE] = (u8) sizeof(struct sadb_spirange), [SADB_X_EXT_KMPRIVATE] = (u8) sizeof(struct sadb_x_kmprivate), [SADB_X_EXT_POLICY] = (u8) sizeof(struct sadb_x_policy), [SADB_X_EXT_SA2] = (u8) sizeof(struct sadb_x_sa2), [SADB_X_EXT_NAT_T_TYPE] = (u8) sizeof(struct sadb_x_nat_t_type), [SADB_X_EXT_NAT_T_SPORT] = (u8) sizeof(struct sadb_x_nat_t_port), [SADB_X_EXT_NAT_T_DPORT] = (u8) sizeof(struct sadb_x_nat_t_port), [SADB_X_EXT_NAT_T_OA] = (u8) sizeof(struct sadb_address), [SADB_X_EXT_SEC_CTX] = (u8) sizeof(struct sadb_x_sec_ctx), [SADB_X_EXT_KMADDRESS] = (u8) sizeof(struct sadb_x_kmaddress), [SADB_X_EXT_FILTER] = (u8) sizeof(struct sadb_x_filter), }; /* Verify sadb_address_{len,prefixlen} against sa_family. */ static int verify_address_len(const void *p) { const struct sadb_address *sp = p; const struct sockaddr *addr = (const struct sockaddr *)(sp + 1); const struct sockaddr_in *sin; #if IS_ENABLED(CONFIG_IPV6) const struct sockaddr_in6 *sin6; #endif int len; if (sp->sadb_address_len < DIV_ROUND_UP(sizeof(*sp) + offsetofend(typeof(*addr), sa_family), sizeof(uint64_t))) return -EINVAL; switch (addr->sa_family) { case AF_INET: len = DIV_ROUND_UP(sizeof(*sp) + sizeof(*sin), sizeof(uint64_t)); if (sp->sadb_address_len != len || sp->sadb_address_prefixlen > 32) return -EINVAL; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: len = DIV_ROUND_UP(sizeof(*sp) + sizeof(*sin6), sizeof(uint64_t)); if (sp->sadb_address_len != len || sp->sadb_address_prefixlen > 128) return -EINVAL; break; #endif default: /* It is user using kernel to keep track of security * associations for another protocol, such as * OSPF/RSVP/RIPV2/MIP. It is user's job to verify * lengths. * * XXX Actually, association/policy database is not yet * XXX able to cope with arbitrary sockaddr families. * XXX When it can, remove this -EINVAL. 
-DaveM */ return -EINVAL; } return 0; } static inline int sadb_key_len(const struct sadb_key *key) { int key_bytes = DIV_ROUND_UP(key->sadb_key_bits, 8); return DIV_ROUND_UP(sizeof(struct sadb_key) + key_bytes, sizeof(uint64_t)); } static int verify_key_len(const void *p) { const struct sadb_key *key = p; if (sadb_key_len(key) > key->sadb_key_len) return -EINVAL; return 0; } static inline int pfkey_sec_ctx_len(const struct sadb_x_sec_ctx *sec_ctx) { return DIV_ROUND_UP(sizeof(struct sadb_x_sec_ctx) + sec_ctx->sadb_x_ctx_len, sizeof(uint64_t)); } static inline int verify_sec_ctx_len(const void *p) { const struct sadb_x_sec_ctx *sec_ctx = p; int len = sec_ctx->sadb_x_ctx_len; if (len > PAGE_SIZE) return -EINVAL; len = pfkey_sec_ctx_len(sec_ctx); if (sec_ctx->sadb_x_sec_len != len) return -EINVAL; return 0; } static inline struct xfrm_user_sec_ctx *pfkey_sadb2xfrm_user_sec_ctx(const struct sadb_x_sec_ctx *sec_ctx, gfp_t gfp) { struct xfrm_user_sec_ctx *uctx = NULL; int ctx_size = sec_ctx->sadb_x_ctx_len; uctx = kmalloc((sizeof(*uctx)+ctx_size), gfp); if (!uctx) return NULL; uctx->len = pfkey_sec_ctx_len(sec_ctx); uctx->exttype = sec_ctx->sadb_x_sec_exttype; uctx->ctx_doi = sec_ctx->sadb_x_ctx_doi; uctx->ctx_alg = sec_ctx->sadb_x_ctx_alg; uctx->ctx_len = sec_ctx->sadb_x_ctx_len; memcpy(uctx + 1, sec_ctx + 1, uctx->ctx_len); return uctx; } static int present_and_same_family(const struct sadb_address *src, const struct sadb_address *dst) { const struct sockaddr *s_addr, *d_addr; if (!src || !dst) return 0; s_addr = (const struct sockaddr *)(src + 1); d_addr = (const struct sockaddr *)(dst + 1); if (s_addr->sa_family != d_addr->sa_family) return 0; if (s_addr->sa_family != AF_INET #if IS_ENABLED(CONFIG_IPV6) && s_addr->sa_family != AF_INET6 #endif ) return 0; return 1; } static int parse_exthdrs(struct sk_buff *skb, const struct sadb_msg *hdr, void **ext_hdrs) { const char *p = (char *) hdr; int len = skb->len; len -= sizeof(*hdr); p += sizeof(*hdr); while (len > 0) { const struct sadb_ext *ehdr = (const struct sadb_ext *) p; uint16_t ext_type; int ext_len; if (len < sizeof(*ehdr)) return -EINVAL; ext_len = ehdr->sadb_ext_len; ext_len *= sizeof(uint64_t); ext_type = ehdr->sadb_ext_type; if (ext_len < sizeof(uint64_t) || ext_len > len || ext_type == SADB_EXT_RESERVED) return -EINVAL; if (ext_type <= SADB_EXT_MAX) { int min = (int) sadb_ext_min_len[ext_type]; if (ext_len < min) return -EINVAL; if (ext_hdrs[ext_type-1] != NULL) return -EINVAL; switch (ext_type) { case SADB_EXT_ADDRESS_SRC: case SADB_EXT_ADDRESS_DST: case SADB_EXT_ADDRESS_PROXY: case SADB_X_EXT_NAT_T_OA: if (verify_address_len(p)) return -EINVAL; break; case SADB_X_EXT_SEC_CTX: if (verify_sec_ctx_len(p)) return -EINVAL; break; case SADB_EXT_KEY_AUTH: case SADB_EXT_KEY_ENCRYPT: if (verify_key_len(p)) return -EINVAL; break; default: break; } ext_hdrs[ext_type-1] = (void *) p; } p += ext_len; len -= ext_len; } return 0; } static uint16_t pfkey_satype2proto(uint8_t satype) { switch (satype) { case SADB_SATYPE_UNSPEC: return IPSEC_PROTO_ANY; case SADB_SATYPE_AH: return IPPROTO_AH; case SADB_SATYPE_ESP: return IPPROTO_ESP; case SADB_X_SATYPE_IPCOMP: return IPPROTO_COMP; default: return 0; } /* NOTREACHED */ } static uint8_t pfkey_proto2satype(uint16_t proto) { switch (proto) { case IPPROTO_AH: return SADB_SATYPE_AH; case IPPROTO_ESP: return SADB_SATYPE_ESP; case IPPROTO_COMP: return SADB_X_SATYPE_IPCOMP; default: return 0; } /* NOTREACHED */ } /* BTW, this scheme means that there is no way with PFKEY2 sockets to * say specifically 'just 
raw sockets' as we encode them as 255. */ static uint8_t pfkey_proto_to_xfrm(uint8_t proto) { return proto == IPSEC_PROTO_ANY ? 0 : proto; } static uint8_t pfkey_proto_from_xfrm(uint8_t proto) { return proto ? proto : IPSEC_PROTO_ANY; } static inline int pfkey_sockaddr_len(sa_family_t family) { switch (family) { case AF_INET: return sizeof(struct sockaddr_in); #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: return sizeof(struct sockaddr_in6); #endif } return 0; } static int pfkey_sockaddr_extract(const struct sockaddr *sa, xfrm_address_t *xaddr) { switch (sa->sa_family) { case AF_INET: xaddr->a4 = ((struct sockaddr_in *)sa)->sin_addr.s_addr; return AF_INET; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: memcpy(xaddr->a6, &((struct sockaddr_in6 *)sa)->sin6_addr, sizeof(struct in6_addr)); return AF_INET6; #endif } return 0; } static int pfkey_sadb_addr2xfrm_addr(const struct sadb_address *addr, xfrm_address_t *xaddr) { return pfkey_sockaddr_extract((struct sockaddr *)(addr + 1), xaddr); } static struct xfrm_state *pfkey_xfrm_state_lookup(struct net *net, const struct sadb_msg *hdr, void * const *ext_hdrs) { const struct sadb_sa *sa; const struct sadb_address *addr; uint16_t proto; unsigned short family; xfrm_address_t *xaddr; sa = ext_hdrs[SADB_EXT_SA - 1]; if (sa == NULL) return NULL; proto = pfkey_satype2proto(hdr->sadb_msg_satype); if (proto == 0) return NULL; /* sadb_address_len should be checked by caller */ addr = ext_hdrs[SADB_EXT_ADDRESS_DST - 1]; if (addr == NULL) return NULL; family = ((const struct sockaddr *)(addr + 1))->sa_family; switch (family) { case AF_INET: xaddr = (xfrm_address_t *)&((const struct sockaddr_in *)(addr + 1))->sin_addr; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: xaddr = (xfrm_address_t *)&((const struct sockaddr_in6 *)(addr + 1))->sin6_addr; break; #endif default: xaddr = NULL; } if (!xaddr) return NULL; return xfrm_state_lookup(net, DUMMY_MARK, xaddr, sa->sadb_sa_spi, proto, family); } #define PFKEY_ALIGN8(a) (1 + (((a) - 1) | (8 - 1))) static int pfkey_sockaddr_size(sa_family_t family) { return PFKEY_ALIGN8(pfkey_sockaddr_len(family)); } static inline int pfkey_mode_from_xfrm(int mode) { switch(mode) { case XFRM_MODE_TRANSPORT: return IPSEC_MODE_TRANSPORT; case XFRM_MODE_TUNNEL: return IPSEC_MODE_TUNNEL; case XFRM_MODE_BEET: return IPSEC_MODE_BEET; default: return -1; } } static inline int pfkey_mode_to_xfrm(int mode) { switch(mode) { case IPSEC_MODE_ANY: /*XXX*/ case IPSEC_MODE_TRANSPORT: return XFRM_MODE_TRANSPORT; case IPSEC_MODE_TUNNEL: return XFRM_MODE_TUNNEL; case IPSEC_MODE_BEET: return XFRM_MODE_BEET; default: return -1; } } static unsigned int pfkey_sockaddr_fill(const xfrm_address_t *xaddr, __be16 port, struct sockaddr *sa, unsigned short family) { switch (family) { case AF_INET: { struct sockaddr_in *sin = (struct sockaddr_in *)sa; sin->sin_family = AF_INET; sin->sin_port = port; sin->sin_addr.s_addr = xaddr->a4; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return 32; } #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa; sin6->sin6_family = AF_INET6; sin6->sin6_port = port; sin6->sin6_flowinfo = 0; sin6->sin6_addr = xaddr->in6; sin6->sin6_scope_id = 0; return 128; } #endif } return 0; } static struct sk_buff *__pfkey_xfrm_state2msg(const struct xfrm_state *x, int add_keys, int hsc) { struct sk_buff *skb; struct sadb_msg *hdr; struct sadb_sa *sa; struct sadb_lifetime *lifetime; struct sadb_address *addr; struct sadb_key *key; struct sadb_x_sa2 *sa2; struct sadb_x_sec_ctx *sec_ctx; struct 
xfrm_sec_ctx *xfrm_ctx; int ctx_size = 0; int size; int auth_key_size = 0; int encrypt_key_size = 0; int sockaddr_size; struct xfrm_encap_tmpl *natt = NULL; int mode; /* address family check */ sockaddr_size = pfkey_sockaddr_size(x->props.family); if (!sockaddr_size) return ERR_PTR(-EINVAL); /* base, SA, (lifetime (HSC),) address(SD), (address(P),) key(AE), (identity(SD),) (sensitivity)> */ size = sizeof(struct sadb_msg) +sizeof(struct sadb_sa) + sizeof(struct sadb_lifetime) + ((hsc & 1) ? sizeof(struct sadb_lifetime) : 0) + ((hsc & 2) ? sizeof(struct sadb_lifetime) : 0) + sizeof(struct sadb_address)*2 + sockaddr_size*2 + sizeof(struct sadb_x_sa2); if ((xfrm_ctx = x->security)) { ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len); size += sizeof(struct sadb_x_sec_ctx) + ctx_size; } /* identity & sensitivity */ if (!xfrm_addr_equal(&x->sel.saddr, &x->props.saddr, x->props.family)) size += sizeof(struct sadb_address) + sockaddr_size; if (add_keys) { if (x->aalg && x->aalg->alg_key_len) { auth_key_size = PFKEY_ALIGN8((x->aalg->alg_key_len + 7) / 8); size += sizeof(struct sadb_key) + auth_key_size; } if (x->ealg && x->ealg->alg_key_len) { encrypt_key_size = PFKEY_ALIGN8((x->ealg->alg_key_len+7) / 8); size += sizeof(struct sadb_key) + encrypt_key_size; } } if (x->encap) natt = x->encap; if (natt && natt->encap_type) { size += sizeof(struct sadb_x_nat_t_type); size += sizeof(struct sadb_x_nat_t_port); size += sizeof(struct sadb_x_nat_t_port); } skb = alloc_skb(size + 16, GFP_ATOMIC); if (skb == NULL) return ERR_PTR(-ENOBUFS); /* call should fill header later */ hdr = skb_put(skb, sizeof(struct sadb_msg)); memset(hdr, 0, size); /* XXX do we need this ? */ hdr->sadb_msg_len = size / sizeof(uint64_t); /* sa */ sa = skb_put(skb, sizeof(struct sadb_sa)); sa->sadb_sa_len = sizeof(struct sadb_sa)/sizeof(uint64_t); sa->sadb_sa_exttype = SADB_EXT_SA; sa->sadb_sa_spi = x->id.spi; sa->sadb_sa_replay = x->props.replay_window; switch (x->km.state) { case XFRM_STATE_VALID: sa->sadb_sa_state = x->km.dying ? SADB_SASTATE_DYING : SADB_SASTATE_MATURE; break; case XFRM_STATE_ACQ: sa->sadb_sa_state = SADB_SASTATE_LARVAL; break; default: sa->sadb_sa_state = SADB_SASTATE_DEAD; break; } sa->sadb_sa_auth = 0; if (x->aalg) { struct xfrm_algo_desc *a = xfrm_aalg_get_byname(x->aalg->alg_name, 0); sa->sadb_sa_auth = (a && a->pfkey_supported) ? a->desc.sadb_alg_id : 0; } sa->sadb_sa_encrypt = 0; BUG_ON(x->ealg && x->calg); if (x->ealg) { struct xfrm_algo_desc *a = xfrm_ealg_get_byname(x->ealg->alg_name, 0); sa->sadb_sa_encrypt = (a && a->pfkey_supported) ? a->desc.sadb_alg_id : 0; } /* KAME compatible: sadb_sa_encrypt is overloaded with calg id */ if (x->calg) { struct xfrm_algo_desc *a = xfrm_calg_get_byname(x->calg->alg_name, 0); sa->sadb_sa_encrypt = (a && a->pfkey_supported) ? 
a->desc.sadb_alg_id : 0; } sa->sadb_sa_flags = 0; if (x->props.flags & XFRM_STATE_NOECN) sa->sadb_sa_flags |= SADB_SAFLAGS_NOECN; if (x->props.flags & XFRM_STATE_DECAP_DSCP) sa->sadb_sa_flags |= SADB_SAFLAGS_DECAP_DSCP; if (x->props.flags & XFRM_STATE_NOPMTUDISC) sa->sadb_sa_flags |= SADB_SAFLAGS_NOPMTUDISC; /* hard time */ if (hsc & 2) { lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD; lifetime->sadb_lifetime_allocations = _X2KEY(x->lft.hard_packet_limit); lifetime->sadb_lifetime_bytes = _X2KEY(x->lft.hard_byte_limit); lifetime->sadb_lifetime_addtime = x->lft.hard_add_expires_seconds; lifetime->sadb_lifetime_usetime = x->lft.hard_use_expires_seconds; } /* soft time */ if (hsc & 1) { lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT; lifetime->sadb_lifetime_allocations = _X2KEY(x->lft.soft_packet_limit); lifetime->sadb_lifetime_bytes = _X2KEY(x->lft.soft_byte_limit); lifetime->sadb_lifetime_addtime = x->lft.soft_add_expires_seconds; lifetime->sadb_lifetime_usetime = x->lft.soft_use_expires_seconds; } /* current time */ lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT; lifetime->sadb_lifetime_allocations = x->curlft.packets; lifetime->sadb_lifetime_bytes = x->curlft.bytes; lifetime->sadb_lifetime_addtime = x->curlft.add_time; lifetime->sadb_lifetime_usetime = x->curlft.use_time; /* src address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; /* "if the ports are non-zero, then the sadb_address_proto field, normally zero, MUST be filled in with the transport protocol's number." 
- RFC2367 */ addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(&x->props.saddr, 0, (struct sockaddr *) (addr + 1), x->props.family); BUG_ON(!addr->sadb_address_prefixlen); /* dst address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(&x->id.daddr, 0, (struct sockaddr *) (addr + 1), x->props.family); BUG_ON(!addr->sadb_address_prefixlen); if (!xfrm_addr_equal(&x->sel.saddr, &x->props.saddr, x->props.family)) { addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_PROXY; addr->sadb_address_proto = pfkey_proto_from_xfrm(x->sel.proto); addr->sadb_address_prefixlen = x->sel.prefixlen_s; addr->sadb_address_reserved = 0; pfkey_sockaddr_fill(&x->sel.saddr, x->sel.sport, (struct sockaddr *) (addr + 1), x->props.family); } /* auth key */ if (add_keys && auth_key_size) { key = skb_put(skb, sizeof(struct sadb_key) + auth_key_size); key->sadb_key_len = (sizeof(struct sadb_key) + auth_key_size) / sizeof(uint64_t); key->sadb_key_exttype = SADB_EXT_KEY_AUTH; key->sadb_key_bits = x->aalg->alg_key_len; key->sadb_key_reserved = 0; memcpy(key + 1, x->aalg->alg_key, (x->aalg->alg_key_len+7)/8); } /* encrypt key */ if (add_keys && encrypt_key_size) { key = skb_put(skb, sizeof(struct sadb_key) + encrypt_key_size); key->sadb_key_len = (sizeof(struct sadb_key) + encrypt_key_size) / sizeof(uint64_t); key->sadb_key_exttype = SADB_EXT_KEY_ENCRYPT; key->sadb_key_bits = x->ealg->alg_key_len; key->sadb_key_reserved = 0; memcpy(key + 1, x->ealg->alg_key, (x->ealg->alg_key_len+7)/8); } /* sa */ sa2 = skb_put(skb, sizeof(struct sadb_x_sa2)); sa2->sadb_x_sa2_len = sizeof(struct sadb_x_sa2)/sizeof(uint64_t); sa2->sadb_x_sa2_exttype = SADB_X_EXT_SA2; if ((mode = pfkey_mode_from_xfrm(x->props.mode)) < 0) { kfree_skb(skb); return ERR_PTR(-EINVAL); } sa2->sadb_x_sa2_mode = mode; sa2->sadb_x_sa2_reserved1 = 0; sa2->sadb_x_sa2_reserved2 = 0; sa2->sadb_x_sa2_sequence = 0; sa2->sadb_x_sa2_reqid = x->props.reqid; if (natt && natt->encap_type) { struct sadb_x_nat_t_type *n_type; struct sadb_x_nat_t_port *n_port; /* type */ n_type = skb_put(skb, sizeof(*n_type)); n_type->sadb_x_nat_t_type_len = sizeof(*n_type)/sizeof(uint64_t); n_type->sadb_x_nat_t_type_exttype = SADB_X_EXT_NAT_T_TYPE; n_type->sadb_x_nat_t_type_type = natt->encap_type; n_type->sadb_x_nat_t_type_reserved[0] = 0; n_type->sadb_x_nat_t_type_reserved[1] = 0; n_type->sadb_x_nat_t_type_reserved[2] = 0; /* source port */ n_port = skb_put(skb, sizeof(*n_port)); n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_SPORT; n_port->sadb_x_nat_t_port_port = natt->encap_sport; n_port->sadb_x_nat_t_port_reserved = 0; /* dest port */ n_port = skb_put(skb, sizeof(*n_port)); n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_DPORT; n_port->sadb_x_nat_t_port_port = natt->encap_dport; n_port->sadb_x_nat_t_port_reserved = 0; } /* security context */ if (xfrm_ctx) { sec_ctx = skb_put(skb, sizeof(struct sadb_x_sec_ctx) + ctx_size); sec_ctx->sadb_x_sec_len = (sizeof(struct 
sadb_x_sec_ctx) + ctx_size) / sizeof(uint64_t); sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX; sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi; sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg; sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len; memcpy(sec_ctx + 1, xfrm_ctx->ctx_str, xfrm_ctx->ctx_len); } return skb; } static inline struct sk_buff *pfkey_xfrm_state2msg(const struct xfrm_state *x) { struct sk_buff *skb; skb = __pfkey_xfrm_state2msg(x, 1, 3); return skb; } static inline struct sk_buff *pfkey_xfrm_state2msg_expire(const struct xfrm_state *x, int hsc) { return __pfkey_xfrm_state2msg(x, 0, hsc); } static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct xfrm_state *x; const struct sadb_lifetime *lifetime; const struct sadb_sa *sa; const struct sadb_key *key; const struct sadb_x_sec_ctx *sec_ctx; uint16_t proto; int err; sa = ext_hdrs[SADB_EXT_SA - 1]; if (!sa || !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1])) return ERR_PTR(-EINVAL); if (hdr->sadb_msg_satype == SADB_SATYPE_ESP && !ext_hdrs[SADB_EXT_KEY_ENCRYPT-1]) return ERR_PTR(-EINVAL); if (hdr->sadb_msg_satype == SADB_SATYPE_AH && !ext_hdrs[SADB_EXT_KEY_AUTH-1]) return ERR_PTR(-EINVAL); if (!!ext_hdrs[SADB_EXT_LIFETIME_HARD-1] != !!ext_hdrs[SADB_EXT_LIFETIME_SOFT-1]) return ERR_PTR(-EINVAL); proto = pfkey_satype2proto(hdr->sadb_msg_satype); if (proto == 0) return ERR_PTR(-EINVAL); /* default error is no buffer space */ err = -ENOBUFS; /* RFC2367: Only SADB_SASTATE_MATURE SAs may be submitted in an SADB_ADD message. SADB_SASTATE_LARVAL SAs are created by SADB_GETSPI and it is not sensible to add a new SA in the DYING or SADB_SASTATE_DEAD state. Therefore, the sadb_sa_state field of all submitted SAs MUST be SADB_SASTATE_MATURE and the kernel MUST return an error if this is not true. However, KAME setkey always uses SADB_SASTATE_LARVAL. Hence, we have to _ignore_ sadb_sa_state, which is also reasonable. 
*/ if (sa->sadb_sa_auth > SADB_AALG_MAX || (hdr->sadb_msg_satype == SADB_X_SATYPE_IPCOMP && sa->sadb_sa_encrypt > SADB_X_CALG_MAX) || sa->sadb_sa_encrypt > SADB_EALG_MAX) return ERR_PTR(-EINVAL); key = ext_hdrs[SADB_EXT_KEY_AUTH - 1]; if (key != NULL && sa->sadb_sa_auth != SADB_X_AALG_NULL && key->sadb_key_bits == 0) return ERR_PTR(-EINVAL); key = ext_hdrs[SADB_EXT_KEY_ENCRYPT-1]; if (key != NULL && sa->sadb_sa_encrypt != SADB_EALG_NULL && key->sadb_key_bits == 0) return ERR_PTR(-EINVAL); x = xfrm_state_alloc(net); if (x == NULL) return ERR_PTR(-ENOBUFS); x->id.proto = proto; x->id.spi = sa->sadb_sa_spi; x->props.replay_window = min_t(unsigned int, sa->sadb_sa_replay, (sizeof(x->replay.bitmap) * 8)); if (sa->sadb_sa_flags & SADB_SAFLAGS_NOECN) x->props.flags |= XFRM_STATE_NOECN; if (sa->sadb_sa_flags & SADB_SAFLAGS_DECAP_DSCP) x->props.flags |= XFRM_STATE_DECAP_DSCP; if (sa->sadb_sa_flags & SADB_SAFLAGS_NOPMTUDISC) x->props.flags |= XFRM_STATE_NOPMTUDISC; lifetime = ext_hdrs[SADB_EXT_LIFETIME_HARD - 1]; if (lifetime != NULL) { x->lft.hard_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); x->lft.hard_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); x->lft.hard_add_expires_seconds = lifetime->sadb_lifetime_addtime; x->lft.hard_use_expires_seconds = lifetime->sadb_lifetime_usetime; } lifetime = ext_hdrs[SADB_EXT_LIFETIME_SOFT - 1]; if (lifetime != NULL) { x->lft.soft_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); x->lft.soft_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); x->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime; x->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime; } sec_ctx = ext_hdrs[SADB_X_EXT_SEC_CTX - 1]; if (sec_ctx != NULL) { struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_KERNEL); if (!uctx) goto out; err = security_xfrm_state_alloc(x, uctx); kfree(uctx); if (err) goto out; } err = -ENOBUFS; key = ext_hdrs[SADB_EXT_KEY_AUTH - 1]; if (sa->sadb_sa_auth) { int keysize = 0; struct xfrm_algo_desc *a = xfrm_aalg_get_byid(sa->sadb_sa_auth); if (!a || !a->pfkey_supported) { err = -ENOSYS; goto out; } if (key) keysize = (key->sadb_key_bits + 7) / 8; x->aalg = kmalloc(sizeof(*x->aalg) + keysize, GFP_KERNEL); if (!x->aalg) { err = -ENOMEM; goto out; } strcpy(x->aalg->alg_name, a->name); x->aalg->alg_key_len = 0; if (key) { x->aalg->alg_key_len = key->sadb_key_bits; memcpy(x->aalg->alg_key, key+1, keysize); } x->aalg->alg_trunc_len = a->uinfo.auth.icv_truncbits; x->props.aalgo = sa->sadb_sa_auth; /* x->algo.flags = sa->sadb_sa_flags; */ } if (sa->sadb_sa_encrypt) { if (hdr->sadb_msg_satype == SADB_X_SATYPE_IPCOMP) { struct xfrm_algo_desc *a = xfrm_calg_get_byid(sa->sadb_sa_encrypt); if (!a || !a->pfkey_supported) { err = -ENOSYS; goto out; } x->calg = kmalloc(sizeof(*x->calg), GFP_KERNEL); if (!x->calg) { err = -ENOMEM; goto out; } strcpy(x->calg->alg_name, a->name); x->props.calgo = sa->sadb_sa_encrypt; } else { int keysize = 0; struct xfrm_algo_desc *a = xfrm_ealg_get_byid(sa->sadb_sa_encrypt); if (!a || !a->pfkey_supported) { err = -ENOSYS; goto out; } key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_ENCRYPT-1]; if (key) keysize = (key->sadb_key_bits + 7) / 8; x->ealg = kmalloc(sizeof(*x->ealg) + keysize, GFP_KERNEL); if (!x->ealg) { err = -ENOMEM; goto out; } strcpy(x->ealg->alg_name, a->name); x->ealg->alg_key_len = 0; if (key) { x->ealg->alg_key_len = key->sadb_key_bits; memcpy(x->ealg->alg_key, key+1, keysize); } x->props.ealgo = sa->sadb_sa_encrypt; x->geniv = a->uinfo.encr.geniv; } } /* 
x->algo.flags = sa->sadb_sa_flags; */ x->props.family = pfkey_sadb_addr2xfrm_addr((struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_SRC-1], &x->props.saddr); pfkey_sadb_addr2xfrm_addr((struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_DST-1], &x->id.daddr); if (ext_hdrs[SADB_X_EXT_SA2-1]) { const struct sadb_x_sa2 *sa2 = ext_hdrs[SADB_X_EXT_SA2-1]; int mode = pfkey_mode_to_xfrm(sa2->sadb_x_sa2_mode); if (mode < 0) { err = -EINVAL; goto out; } x->props.mode = mode; x->props.reqid = sa2->sadb_x_sa2_reqid; } if (ext_hdrs[SADB_EXT_ADDRESS_PROXY-1]) { const struct sadb_address *addr = ext_hdrs[SADB_EXT_ADDRESS_PROXY-1]; /* Nobody uses this, but we try. */ x->sel.family = pfkey_sadb_addr2xfrm_addr(addr, &x->sel.saddr); x->sel.prefixlen_s = addr->sadb_address_prefixlen; } if (!x->sel.family) x->sel.family = x->props.family; if (ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1]) { const struct sadb_x_nat_t_type* n_type; struct xfrm_encap_tmpl *natt; x->encap = kzalloc(sizeof(*x->encap), GFP_KERNEL); if (!x->encap) { err = -ENOMEM; goto out; } natt = x->encap; n_type = ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1]; natt->encap_type = n_type->sadb_x_nat_t_type_type; if (ext_hdrs[SADB_X_EXT_NAT_T_SPORT-1]) { const struct sadb_x_nat_t_port *n_port = ext_hdrs[SADB_X_EXT_NAT_T_SPORT-1]; natt->encap_sport = n_port->sadb_x_nat_t_port_port; } if (ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1]) { const struct sadb_x_nat_t_port *n_port = ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1]; natt->encap_dport = n_port->sadb_x_nat_t_port_port; } } err = xfrm_init_state(x); if (err) goto out; x->km.seq = hdr->sadb_msg_seq; return x; out: x->km.state = XFRM_STATE_DEAD; xfrm_state_put(x); return ERR_PTR(err); } static int pfkey_reserved(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { return -EOPNOTSUPP; } static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); struct sk_buff *resp_skb; struct sadb_x_sa2 *sa2; struct sadb_address *saddr, *daddr; struct sadb_msg *out_hdr; struct sadb_spirange *range; struct xfrm_state *x = NULL; int mode; int err; u32 min_spi, max_spi; u32 reqid; u8 proto; unsigned short family; xfrm_address_t *xsaddr = NULL, *xdaddr = NULL; if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1])) return -EINVAL; proto = pfkey_satype2proto(hdr->sadb_msg_satype); if (proto == 0) return -EINVAL; if ((sa2 = ext_hdrs[SADB_X_EXT_SA2-1]) != NULL) { mode = pfkey_mode_to_xfrm(sa2->sadb_x_sa2_mode); if (mode < 0) return -EINVAL; reqid = sa2->sadb_x_sa2_reqid; } else { mode = 0; reqid = 0; } saddr = ext_hdrs[SADB_EXT_ADDRESS_SRC-1]; daddr = ext_hdrs[SADB_EXT_ADDRESS_DST-1]; family = ((struct sockaddr *)(saddr + 1))->sa_family; switch (family) { case AF_INET: xdaddr = (xfrm_address_t *)&((struct sockaddr_in *)(daddr + 1))->sin_addr.s_addr; xsaddr = (xfrm_address_t *)&((struct sockaddr_in *)(saddr + 1))->sin_addr.s_addr; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: xdaddr = (xfrm_address_t *)&((struct sockaddr_in6 *)(daddr + 1))->sin6_addr; xsaddr = (xfrm_address_t *)&((struct sockaddr_in6 *)(saddr + 1))->sin6_addr; break; #endif } if (hdr->sadb_msg_seq) { x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq, UINT_MAX); if (x && !xfrm_addr_equal(&x->id.daddr, xdaddr, family)) { xfrm_state_put(x); x = NULL; } } if (!x) x = xfrm_find_acq(net, &dummy_mark, mode, reqid, 0, UINT_MAX, proto, xdaddr, xsaddr, 1, family); if (x == NULL) return -ENOENT; min_spi = 0x100; max_spi = 0x0fffffff; 
range = ext_hdrs[SADB_EXT_SPIRANGE-1]; if (range) { min_spi = range->sadb_spirange_min; max_spi = range->sadb_spirange_max; } err = verify_spi_info(x->id.proto, min_spi, max_spi, NULL); if (err) { xfrm_state_put(x); return err; } err = xfrm_alloc_spi(x, min_spi, max_spi, NULL); resp_skb = err ? ERR_PTR(err) : pfkey_xfrm_state2msg(x); if (IS_ERR(resp_skb)) { xfrm_state_put(x); return PTR_ERR(resp_skb); } out_hdr = (struct sadb_msg *) resp_skb->data; out_hdr->sadb_msg_version = hdr->sadb_msg_version; out_hdr->sadb_msg_type = SADB_GETSPI; out_hdr->sadb_msg_satype = pfkey_proto2satype(proto); out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_reserved = 0; out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; xfrm_state_put(x); pfkey_broadcast(resp_skb, GFP_KERNEL, BROADCAST_ONE, sk, net); return 0; } static int pfkey_acquire(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); struct xfrm_state *x; if (hdr->sadb_msg_len != sizeof(struct sadb_msg)/8) return -EOPNOTSUPP; if (hdr->sadb_msg_seq == 0 || hdr->sadb_msg_errno == 0) return 0; x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq, UINT_MAX); if (x == NULL) return 0; spin_lock_bh(&x->lock); if (x->km.state == XFRM_STATE_ACQ) x->km.state = XFRM_STATE_ERROR; spin_unlock_bh(&x->lock); xfrm_state_put(x); return 0; } static inline int event2poltype(int event) { switch (event) { case XFRM_MSG_DELPOLICY: return SADB_X_SPDDELETE; case XFRM_MSG_NEWPOLICY: return SADB_X_SPDADD; case XFRM_MSG_UPDPOLICY: return SADB_X_SPDUPDATE; case XFRM_MSG_POLEXPIRE: // return SADB_X_SPDEXPIRE; default: pr_err("pfkey: Unknown policy event %d\n", event); break; } return 0; } static inline int event2keytype(int event) { switch (event) { case XFRM_MSG_DELSA: return SADB_DELETE; case XFRM_MSG_NEWSA: return SADB_ADD; case XFRM_MSG_UPDSA: return SADB_UPDATE; case XFRM_MSG_EXPIRE: return SADB_EXPIRE; default: pr_err("pfkey: Unknown SA event %d\n", event); break; } return 0; } /* ADD/UPD/DEL */ static int key_notify_sa(struct xfrm_state *x, const struct km_event *c) { struct sk_buff *skb; struct sadb_msg *hdr; skb = pfkey_xfrm_state2msg(x); if (IS_ERR(skb)) return PTR_ERR(skb); hdr = (struct sadb_msg *) skb->data; hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_type = event2keytype(c->event); hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); hdr->sadb_msg_errno = 0; hdr->sadb_msg_reserved = 0; hdr->sadb_msg_seq = c->seq; hdr->sadb_msg_pid = c->portid; pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xs_net(x)); return 0; } static int pfkey_add(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); struct xfrm_state *x; int err; struct km_event c; x = pfkey_msg2xfrm_state(net, hdr, ext_hdrs); if (IS_ERR(x)) return PTR_ERR(x); xfrm_state_hold(x); if (hdr->sadb_msg_type == SADB_ADD) err = xfrm_state_add(x); else err = xfrm_state_update(x); xfrm_audit_state_add(x, err ? 
0 : 1, true); if (err < 0) { x->km.state = XFRM_STATE_DEAD; __xfrm_state_put(x); goto out; } if (hdr->sadb_msg_type == SADB_ADD) c.event = XFRM_MSG_NEWSA; else c.event = XFRM_MSG_UPDSA; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; km_state_notify(x, &c); out: xfrm_state_put(x); return err; } static int pfkey_delete(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); struct xfrm_state *x; struct km_event c; int err; if (!ext_hdrs[SADB_EXT_SA-1] || !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1])) return -EINVAL; x = pfkey_xfrm_state_lookup(net, hdr, ext_hdrs); if (x == NULL) return -ESRCH; if ((err = security_xfrm_state_delete(x))) goto out; if (xfrm_state_kern(x)) { err = -EPERM; goto out; } err = xfrm_state_delete(x); if (err < 0) goto out; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; c.event = XFRM_MSG_DELSA; km_state_notify(x, &c); out: xfrm_audit_state_delete(x, err ? 0 : 1, true); xfrm_state_put(x); return err; } static int pfkey_get(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); __u8 proto; struct sk_buff *out_skb; struct sadb_msg *out_hdr; struct xfrm_state *x; if (!ext_hdrs[SADB_EXT_SA-1] || !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1])) return -EINVAL; x = pfkey_xfrm_state_lookup(net, hdr, ext_hdrs); if (x == NULL) return -ESRCH; out_skb = pfkey_xfrm_state2msg(x); proto = x->id.proto; xfrm_state_put(x); if (IS_ERR(out_skb)) return PTR_ERR(out_skb); out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = hdr->sadb_msg_version; out_hdr->sadb_msg_type = SADB_GET; out_hdr->sadb_msg_satype = pfkey_proto2satype(proto); out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_reserved = 0; out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk)); return 0; } static struct sk_buff *compose_sadb_supported(const struct sadb_msg *orig, gfp_t allocation) { struct sk_buff *skb; struct sadb_msg *hdr; int len, auth_len, enc_len, i; auth_len = xfrm_count_pfkey_auth_supported(); if (auth_len) { auth_len *= sizeof(struct sadb_alg); auth_len += sizeof(struct sadb_supported); } enc_len = xfrm_count_pfkey_enc_supported(); if (enc_len) { enc_len *= sizeof(struct sadb_alg); enc_len += sizeof(struct sadb_supported); } len = enc_len + auth_len + sizeof(struct sadb_msg); skb = alloc_skb(len + 16, allocation); if (!skb) goto out_put_algs; hdr = skb_put(skb, sizeof(*hdr)); pfkey_hdr_dup(hdr, orig); hdr->sadb_msg_errno = 0; hdr->sadb_msg_len = len / sizeof(uint64_t); if (auth_len) { struct sadb_supported *sp; struct sadb_alg *ap; sp = skb_put(skb, auth_len); ap = (struct sadb_alg *) (sp + 1); sp->sadb_supported_len = auth_len / sizeof(uint64_t); sp->sadb_supported_exttype = SADB_EXT_SUPPORTED_AUTH; for (i = 0; ; i++) { struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i); if (!aalg) break; if (!aalg->pfkey_supported) continue; if (aalg->available) *ap++ = aalg->desc; } } if (enc_len) { struct sadb_supported *sp; struct sadb_alg *ap; sp = skb_put(skb, enc_len); ap = (struct sadb_alg *) (sp + 1); sp->sadb_supported_len = enc_len / sizeof(uint64_t); sp->sadb_supported_exttype = SADB_EXT_SUPPORTED_ENCRYPT; for (i = 0; ; i++) { struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i); if (!ealg) break; if (!ealg->pfkey_supported) continue; if 
(ealg->available) *ap++ = ealg->desc; } } out_put_algs: return skb; } static int pfkey_register(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct pfkey_sock *pfk = pfkey_sk(sk); struct sk_buff *supp_skb; if (hdr->sadb_msg_satype > SADB_SATYPE_MAX) return -EINVAL; if (hdr->sadb_msg_satype != SADB_SATYPE_UNSPEC) { if (pfk->registered&(1<<hdr->sadb_msg_satype)) return -EEXIST; pfk->registered |= (1<<hdr->sadb_msg_satype); } mutex_lock(&pfkey_mutex); xfrm_probe_algs(); supp_skb = compose_sadb_supported(hdr, GFP_KERNEL | __GFP_ZERO); mutex_unlock(&pfkey_mutex); if (!supp_skb) { if (hdr->sadb_msg_satype != SADB_SATYPE_UNSPEC) pfk->registered &= ~(1<<hdr->sadb_msg_satype); return -ENOBUFS; } pfkey_broadcast(supp_skb, GFP_KERNEL, BROADCAST_REGISTERED, sk, sock_net(sk)); return 0; } static int unicast_flush_resp(struct sock *sk, const struct sadb_msg *ihdr) { struct sk_buff *skb; struct sadb_msg *hdr; skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC); if (!skb) return -ENOBUFS; hdr = skb_put_data(skb, ihdr, sizeof(struct sadb_msg)); hdr->sadb_msg_errno = (uint8_t) 0; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk)); } static int key_notify_sa_flush(const struct km_event *c) { struct sk_buff *skb; struct sadb_msg *hdr; skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC); if (!skb) return -ENOBUFS; hdr = skb_put(skb, sizeof(struct sadb_msg)); hdr->sadb_msg_satype = pfkey_proto2satype(c->data.proto); hdr->sadb_msg_type = SADB_FLUSH; hdr->sadb_msg_seq = c->seq; hdr->sadb_msg_pid = c->portid; hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_errno = (uint8_t) 0; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); hdr->sadb_msg_reserved = 0; pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net); return 0; } static int pfkey_flush(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); unsigned int proto; struct km_event c; int err, err2; proto = pfkey_satype2proto(hdr->sadb_msg_satype); if (proto == 0) return -EINVAL; err = xfrm_state_flush(net, proto, true, false); err2 = unicast_flush_resp(sk, hdr); if (err || err2) { if (err == -ESRCH) /* empty table - go quietly */ err = 0; return err ? 
err : err2; } c.data.proto = proto; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; c.event = XFRM_MSG_FLUSHSA; c.net = net; km_state_notify(NULL, &c); return 0; } static int dump_sa(struct xfrm_state *x, int count, void *ptr) { struct pfkey_sock *pfk = ptr; struct sk_buff *out_skb; struct sadb_msg *out_hdr; if (!pfkey_can_dump(&pfk->sk)) return -ENOBUFS; out_skb = pfkey_xfrm_state2msg(x); if (IS_ERR(out_skb)) return PTR_ERR(out_skb); out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = pfk->dump.msg_version; out_hdr->sadb_msg_type = SADB_DUMP; out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_reserved = 0; out_hdr->sadb_msg_seq = count + 1; out_hdr->sadb_msg_pid = pfk->dump.msg_portid; if (pfk->dump.skb) pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = out_skb; return 0; } static int pfkey_dump_sa(struct pfkey_sock *pfk) { struct net *net = sock_net(&pfk->sk); return xfrm_state_walk(net, &pfk->dump.u.state, dump_sa, (void *) pfk); } static void pfkey_dump_sa_done(struct pfkey_sock *pfk) { struct net *net = sock_net(&pfk->sk); xfrm_state_walk_done(&pfk->dump.u.state, net); } static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { u8 proto; struct xfrm_address_filter *filter = NULL; struct pfkey_sock *pfk = pfkey_sk(sk); mutex_lock(&pfk->dump_lock); if (pfk->dump.dump != NULL) { mutex_unlock(&pfk->dump_lock); return -EBUSY; } proto = pfkey_satype2proto(hdr->sadb_msg_satype); if (proto == 0) { mutex_unlock(&pfk->dump_lock); return -EINVAL; } if (ext_hdrs[SADB_X_EXT_FILTER - 1]) { struct sadb_x_filter *xfilter = ext_hdrs[SADB_X_EXT_FILTER - 1]; if ((xfilter->sadb_x_filter_splen > (sizeof(xfrm_address_t) << 3)) || (xfilter->sadb_x_filter_dplen > (sizeof(xfrm_address_t) << 3))) { mutex_unlock(&pfk->dump_lock); return -EINVAL; } filter = kmalloc(sizeof(*filter), GFP_KERNEL); if (filter == NULL) { mutex_unlock(&pfk->dump_lock); return -ENOMEM; } memcpy(&filter->saddr, &xfilter->sadb_x_filter_saddr, sizeof(xfrm_address_t)); memcpy(&filter->daddr, &xfilter->sadb_x_filter_daddr, sizeof(xfrm_address_t)); filter->family = xfilter->sadb_x_filter_family; filter->splen = xfilter->sadb_x_filter_splen; filter->dplen = xfilter->sadb_x_filter_dplen; } pfk->dump.msg_version = hdr->sadb_msg_version; pfk->dump.msg_portid = hdr->sadb_msg_pid; pfk->dump.dump = pfkey_dump_sa; pfk->dump.done = pfkey_dump_sa_done; xfrm_state_walk_init(&pfk->dump.u.state, proto, filter); mutex_unlock(&pfk->dump_lock); return pfkey_do_dump(pfk); } static int pfkey_promisc(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct pfkey_sock *pfk = pfkey_sk(sk); int satype = hdr->sadb_msg_satype; bool reset_errno = false; if (hdr->sadb_msg_len == (sizeof(*hdr) / sizeof(uint64_t))) { reset_errno = true; if (satype != 0 && satype != 1) return -EINVAL; pfk->promisc = satype; } if (reset_errno && skb_cloned(skb)) skb = skb_copy(skb, GFP_KERNEL); else skb = skb_clone(skb, GFP_KERNEL); if (reset_errno && skb) { struct sadb_msg *new_hdr = (struct sadb_msg *) skb->data; new_hdr->sadb_msg_errno = 0; } pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ALL, NULL, sock_net(sk)); return 0; } static int check_reqid(struct xfrm_policy *xp, int dir, int count, void *ptr) { int i; u32 reqid = *(u32*)ptr; for (i=0; i<xp->xfrm_nr; i++) { if (xp->xfrm_vec[i].reqid == reqid) return -EEXIST; } return 0; } static u32 
gen_reqid(struct net *net) { struct xfrm_policy_walk walk; u32 start; int rc; static u32 reqid = IPSEC_MANUAL_REQID_MAX; start = reqid; do { ++reqid; if (reqid == 0) reqid = IPSEC_MANUAL_REQID_MAX+1; xfrm_policy_walk_init(&walk, XFRM_POLICY_TYPE_MAIN); rc = xfrm_policy_walk(net, &walk, check_reqid, (void*)&reqid); xfrm_policy_walk_done(&walk, net); if (rc != -EEXIST) return reqid; } while (reqid != start); return 0; } static int parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_policy *pol, struct sadb_x_ipsecrequest *rq) { struct net *net = xp_net(xp); struct xfrm_tmpl *t = xp->xfrm_vec + xp->xfrm_nr; int mode; if (xp->xfrm_nr >= XFRM_MAX_DEPTH) return -ELOOP; if (rq->sadb_x_ipsecrequest_mode == 0) return -EINVAL; if (!xfrm_id_proto_valid(rq->sadb_x_ipsecrequest_proto)) return -EINVAL; t->id.proto = rq->sadb_x_ipsecrequest_proto; if ((mode = pfkey_mode_to_xfrm(rq->sadb_x_ipsecrequest_mode)) < 0) return -EINVAL; t->mode = mode; if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_USE) { if ((mode == XFRM_MODE_TUNNEL || mode == XFRM_MODE_BEET) && pol->sadb_x_policy_dir == IPSEC_DIR_OUTBOUND) return -EINVAL; t->optional = 1; } else if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_UNIQUE) { t->reqid = rq->sadb_x_ipsecrequest_reqid; if (t->reqid > IPSEC_MANUAL_REQID_MAX) t->reqid = 0; if (!t->reqid && !(t->reqid = gen_reqid(net))) return -ENOBUFS; } /* addresses present only in tunnel mode */ if (t->mode == XFRM_MODE_TUNNEL) { int err; err = parse_sockaddr_pair( (struct sockaddr *)(rq + 1), rq->sadb_x_ipsecrequest_len - sizeof(*rq), &t->saddr, &t->id.daddr, &t->encap_family); if (err) return err; } else t->encap_family = xp->family; /* No way to set this via kame pfkey */ t->allalgs = 1; xp->xfrm_nr++; return 0; } static int parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol) { int err; int len = pol->sadb_x_policy_len*8 - sizeof(struct sadb_x_policy); struct sadb_x_ipsecrequest *rq = (void*)(pol+1); if (pol->sadb_x_policy_len * 8 < sizeof(struct sadb_x_policy)) return -EINVAL; while (len >= sizeof(*rq)) { if (len < rq->sadb_x_ipsecrequest_len || rq->sadb_x_ipsecrequest_len < sizeof(*rq)) return -EINVAL; if ((err = parse_ipsecrequest(xp, pol, rq)) < 0) return err; len -= rq->sadb_x_ipsecrequest_len; rq = (void*)((u8*)rq + rq->sadb_x_ipsecrequest_len); } return 0; } static inline int pfkey_xfrm_policy2sec_ctx_size(const struct xfrm_policy *xp) { struct xfrm_sec_ctx *xfrm_ctx = xp->security; if (xfrm_ctx) { int len = sizeof(struct sadb_x_sec_ctx); len += xfrm_ctx->ctx_len; return PFKEY_ALIGN8(len); } return 0; } static int pfkey_xfrm_policy2msg_size(const struct xfrm_policy *xp) { const struct xfrm_tmpl *t; int sockaddr_size = pfkey_sockaddr_size(xp->family); int socklen = 0; int i; for (i=0; i<xp->xfrm_nr; i++) { t = xp->xfrm_vec + i; socklen += pfkey_sockaddr_len(t->encap_family); } return sizeof(struct sadb_msg) + (sizeof(struct sadb_lifetime) * 3) + (sizeof(struct sadb_address) * 2) + (sockaddr_size * 2) + sizeof(struct sadb_x_policy) + (xp->xfrm_nr * sizeof(struct sadb_x_ipsecrequest)) + (socklen * 2) + pfkey_xfrm_policy2sec_ctx_size(xp); } static struct sk_buff * pfkey_xfrm_policy2msg_prep(const struct xfrm_policy *xp) { struct sk_buff *skb; int size; size = pfkey_xfrm_policy2msg_size(xp); skb = alloc_skb(size + 16, GFP_ATOMIC); if (skb == NULL) return ERR_PTR(-ENOBUFS); return skb; } static int pfkey_xfrm_policy2msg(struct sk_buff *skb, const struct xfrm_policy *xp, int dir) { struct sadb_msg *hdr; struct sadb_address *addr; struct sadb_lifetime *lifetime; struct 
sadb_x_policy *pol; struct sadb_x_sec_ctx *sec_ctx; struct xfrm_sec_ctx *xfrm_ctx; int i; int size; int sockaddr_size = pfkey_sockaddr_size(xp->family); int socklen = pfkey_sockaddr_len(xp->family); size = pfkey_xfrm_policy2msg_size(xp); /* call should fill header later */ hdr = skb_put(skb, sizeof(struct sadb_msg)); memset(hdr, 0, size); /* XXX do we need this ? */ /* src address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; addr->sadb_address_proto = pfkey_proto_from_xfrm(xp->selector.proto); addr->sadb_address_prefixlen = xp->selector.prefixlen_s; addr->sadb_address_reserved = 0; if (!pfkey_sockaddr_fill(&xp->selector.saddr, xp->selector.sport, (struct sockaddr *) (addr + 1), xp->family)) BUG(); /* dst address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; addr->sadb_address_proto = pfkey_proto_from_xfrm(xp->selector.proto); addr->sadb_address_prefixlen = xp->selector.prefixlen_d; addr->sadb_address_reserved = 0; pfkey_sockaddr_fill(&xp->selector.daddr, xp->selector.dport, (struct sockaddr *) (addr + 1), xp->family); /* hard time */ lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD; lifetime->sadb_lifetime_allocations = _X2KEY(xp->lft.hard_packet_limit); lifetime->sadb_lifetime_bytes = _X2KEY(xp->lft.hard_byte_limit); lifetime->sadb_lifetime_addtime = xp->lft.hard_add_expires_seconds; lifetime->sadb_lifetime_usetime = xp->lft.hard_use_expires_seconds; /* soft time */ lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT; lifetime->sadb_lifetime_allocations = _X2KEY(xp->lft.soft_packet_limit); lifetime->sadb_lifetime_bytes = _X2KEY(xp->lft.soft_byte_limit); lifetime->sadb_lifetime_addtime = xp->lft.soft_add_expires_seconds; lifetime->sadb_lifetime_usetime = xp->lft.soft_use_expires_seconds; /* current time */ lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT; lifetime->sadb_lifetime_allocations = xp->curlft.packets; lifetime->sadb_lifetime_bytes = xp->curlft.bytes; lifetime->sadb_lifetime_addtime = xp->curlft.add_time; lifetime->sadb_lifetime_usetime = xp->curlft.use_time; pol = skb_put(skb, sizeof(struct sadb_x_policy)); pol->sadb_x_policy_len = sizeof(struct sadb_x_policy)/sizeof(uint64_t); pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY; pol->sadb_x_policy_type = IPSEC_POLICY_DISCARD; if (xp->action == XFRM_POLICY_ALLOW) { if (xp->xfrm_nr) pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC; else pol->sadb_x_policy_type = IPSEC_POLICY_NONE; } pol->sadb_x_policy_dir = dir+1; pol->sadb_x_policy_reserved = 0; pol->sadb_x_policy_id = xp->index; pol->sadb_x_policy_priority = xp->priority; for (i=0; i<xp->xfrm_nr; i++) { const struct xfrm_tmpl *t = xp->xfrm_vec + i; struct sadb_x_ipsecrequest *rq; int req_size; int mode; req_size = sizeof(struct sadb_x_ipsecrequest); if (t->mode == XFRM_MODE_TUNNEL) { socklen = pfkey_sockaddr_len(t->encap_family); req_size += socklen * 2; } 
else { size -= 2*socklen; } rq = skb_put(skb, req_size); pol->sadb_x_policy_len += req_size/8; memset(rq, 0, sizeof(*rq)); rq->sadb_x_ipsecrequest_len = req_size; rq->sadb_x_ipsecrequest_proto = t->id.proto; if ((mode = pfkey_mode_from_xfrm(t->mode)) < 0) return -EINVAL; rq->sadb_x_ipsecrequest_mode = mode; rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_REQUIRE; if (t->reqid) rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_UNIQUE; if (t->optional) rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_USE; rq->sadb_x_ipsecrequest_reqid = t->reqid; if (t->mode == XFRM_MODE_TUNNEL) { u8 *sa = (void *)(rq + 1); pfkey_sockaddr_fill(&t->saddr, 0, (struct sockaddr *)sa, t->encap_family); pfkey_sockaddr_fill(&t->id.daddr, 0, (struct sockaddr *) (sa + socklen), t->encap_family); } } /* security context */ if ((xfrm_ctx = xp->security)) { int ctx_size = pfkey_xfrm_policy2sec_ctx_size(xp); sec_ctx = skb_put(skb, ctx_size); sec_ctx->sadb_x_sec_len = ctx_size / sizeof(uint64_t); sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX; sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi; sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg; sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len; memcpy(sec_ctx + 1, xfrm_ctx->ctx_str, xfrm_ctx->ctx_len); } hdr->sadb_msg_len = size / sizeof(uint64_t); hdr->sadb_msg_reserved = refcount_read(&xp->refcnt); return 0; } static int key_notify_policy(struct xfrm_policy *xp, int dir, const struct km_event *c) { struct sk_buff *out_skb; struct sadb_msg *out_hdr; int err; out_skb = pfkey_xfrm_policy2msg_prep(xp); if (IS_ERR(out_skb)) return PTR_ERR(out_skb); err = pfkey_xfrm_policy2msg(out_skb, xp, dir); if (err < 0) { kfree_skb(out_skb); return err; } out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = PF_KEY_V2; if (c->data.byid && c->event == XFRM_MSG_DELPOLICY) out_hdr->sadb_msg_type = SADB_X_SPDDELETE2; else out_hdr->sadb_msg_type = event2poltype(c->event); out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_seq = c->seq; out_hdr->sadb_msg_pid = c->portid; pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xp_net(xp)); return 0; } static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); int err = 0; struct sadb_lifetime *lifetime; struct sadb_address *sa; struct sadb_x_policy *pol; struct xfrm_policy *xp; struct km_event c; struct sadb_x_sec_ctx *sec_ctx; if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || !ext_hdrs[SADB_X_EXT_POLICY-1]) return -EINVAL; pol = ext_hdrs[SADB_X_EXT_POLICY-1]; if (pol->sadb_x_policy_type > IPSEC_POLICY_IPSEC) return -EINVAL; if (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir >= IPSEC_DIR_MAX) return -EINVAL; xp = xfrm_policy_alloc(net, GFP_KERNEL); if (xp == NULL) return -ENOBUFS; xp->action = (pol->sadb_x_policy_type == IPSEC_POLICY_DISCARD ? XFRM_POLICY_BLOCK : XFRM_POLICY_ALLOW); xp->priority = pol->sadb_x_policy_priority; sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1]; xp->family = pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.saddr); xp->selector.family = xp->family; xp->selector.prefixlen_s = sa->sadb_address_prefixlen; xp->selector.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); xp->selector.sport = ((struct sockaddr_in *)(sa+1))->sin_port; if (xp->selector.sport) xp->selector.sport_mask = htons(0xffff); sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1]; pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.daddr); xp->selector.prefixlen_d = sa->sadb_address_prefixlen; /* Amusing, we set this twice. 
KAME apps appear to set same value * in both addresses. */ xp->selector.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); xp->selector.dport = ((struct sockaddr_in *)(sa+1))->sin_port; if (xp->selector.dport) xp->selector.dport_mask = htons(0xffff); sec_ctx = ext_hdrs[SADB_X_EXT_SEC_CTX - 1]; if (sec_ctx != NULL) { struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_KERNEL); if (!uctx) { err = -ENOBUFS; goto out; } err = security_xfrm_policy_alloc(&xp->security, uctx, GFP_KERNEL); kfree(uctx); if (err) goto out; } xp->lft.soft_byte_limit = XFRM_INF; xp->lft.hard_byte_limit = XFRM_INF; xp->lft.soft_packet_limit = XFRM_INF; xp->lft.hard_packet_limit = XFRM_INF; if ((lifetime = ext_hdrs[SADB_EXT_LIFETIME_HARD-1]) != NULL) { xp->lft.hard_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); xp->lft.hard_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); xp->lft.hard_add_expires_seconds = lifetime->sadb_lifetime_addtime; xp->lft.hard_use_expires_seconds = lifetime->sadb_lifetime_usetime; } if ((lifetime = ext_hdrs[SADB_EXT_LIFETIME_SOFT-1]) != NULL) { xp->lft.soft_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); xp->lft.soft_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); xp->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime; xp->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime; } xp->xfrm_nr = 0; if (pol->sadb_x_policy_type == IPSEC_POLICY_IPSEC && (err = parse_ipsecrequests(xp, pol)) < 0) goto out; err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp, hdr->sadb_msg_type != SADB_X_SPDUPDATE); xfrm_audit_policy_add(xp, err ? 0 : 1, true); if (err) goto out; if (hdr->sadb_msg_type == SADB_X_SPDUPDATE) c.event = XFRM_MSG_UPDPOLICY; else c.event = XFRM_MSG_NEWPOLICY; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c); xfrm_pol_put(xp); return 0; out: xp->walk.dead = 1; xfrm_policy_destroy(xp); return err; } static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); int err; struct sadb_address *sa; struct sadb_x_policy *pol; struct xfrm_policy *xp; struct xfrm_selector sel; struct km_event c; struct sadb_x_sec_ctx *sec_ctx; struct xfrm_sec_ctx *pol_ctx = NULL; if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || !ext_hdrs[SADB_X_EXT_POLICY-1]) return -EINVAL; pol = ext_hdrs[SADB_X_EXT_POLICY-1]; if (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir >= IPSEC_DIR_MAX) return -EINVAL; memset(&sel, 0, sizeof(sel)); sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1]; sel.family = pfkey_sadb_addr2xfrm_addr(sa, &sel.saddr); sel.prefixlen_s = sa->sadb_address_prefixlen; sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); sel.sport = ((struct sockaddr_in *)(sa+1))->sin_port; if (sel.sport) sel.sport_mask = htons(0xffff); sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1]; pfkey_sadb_addr2xfrm_addr(sa, &sel.daddr); sel.prefixlen_d = sa->sadb_address_prefixlen; sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); sel.dport = ((struct sockaddr_in *)(sa+1))->sin_port; if (sel.dport) sel.dport_mask = htons(0xffff); sec_ctx = ext_hdrs[SADB_X_EXT_SEC_CTX - 1]; if (sec_ctx != NULL) { struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_KERNEL); if (!uctx) return -ENOMEM; err = security_xfrm_policy_alloc(&pol_ctx, uctx, GFP_KERNEL); kfree(uctx); if (err) return err; } xp = xfrm_policy_bysel_ctx(net, &dummy_mark, 0, XFRM_POLICY_TYPE_MAIN, 
pol->sadb_x_policy_dir - 1, &sel, pol_ctx, 1, &err); security_xfrm_policy_free(pol_ctx); if (xp == NULL) return -ENOENT; xfrm_audit_policy_delete(xp, err ? 0 : 1, true); if (err) goto out; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; c.data.byid = 0; c.event = XFRM_MSG_DELPOLICY; km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c); out: xfrm_pol_put(xp); return err; } static int key_pol_get_resp(struct sock *sk, struct xfrm_policy *xp, const struct sadb_msg *hdr, int dir) { int err; struct sk_buff *out_skb; struct sadb_msg *out_hdr; err = 0; out_skb = pfkey_xfrm_policy2msg_prep(xp); if (IS_ERR(out_skb)) { err = PTR_ERR(out_skb); goto out; } err = pfkey_xfrm_policy2msg(out_skb, xp, dir); if (err < 0) { kfree_skb(out_skb); goto out; } out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = hdr->sadb_msg_version; out_hdr->sadb_msg_type = hdr->sadb_msg_type; out_hdr->sadb_msg_satype = 0; out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, xp_net(xp)); err = 0; out: return err; } static int pfkey_sockaddr_pair_size(sa_family_t family) { return PFKEY_ALIGN8(pfkey_sockaddr_len(family) * 2); } static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len, xfrm_address_t *saddr, xfrm_address_t *daddr, u16 *family) { int af, socklen; if (ext_len < 2 || ext_len < pfkey_sockaddr_pair_size(sa->sa_family)) return -EINVAL; af = pfkey_sockaddr_extract(sa, saddr); if (!af) return -EINVAL; socklen = pfkey_sockaddr_len(af); if (pfkey_sockaddr_extract((struct sockaddr *) (((u8 *)sa) + socklen), daddr) != af) return -EINVAL; *family = af; return 0; } #ifdef CONFIG_NET_KEY_MIGRATE static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len, struct xfrm_migrate *m) { int err; struct sadb_x_ipsecrequest *rq2; int mode; if (len < sizeof(*rq1) || len < rq1->sadb_x_ipsecrequest_len || rq1->sadb_x_ipsecrequest_len < sizeof(*rq1)) return -EINVAL; /* old endoints */ err = parse_sockaddr_pair((struct sockaddr *)(rq1 + 1), rq1->sadb_x_ipsecrequest_len - sizeof(*rq1), &m->old_saddr, &m->old_daddr, &m->old_family); if (err) return err; rq2 = (struct sadb_x_ipsecrequest *)((u8 *)rq1 + rq1->sadb_x_ipsecrequest_len); len -= rq1->sadb_x_ipsecrequest_len; if (len <= sizeof(*rq2) || len < rq2->sadb_x_ipsecrequest_len || rq2->sadb_x_ipsecrequest_len < sizeof(*rq2)) return -EINVAL; /* new endpoints */ err = parse_sockaddr_pair((struct sockaddr *)(rq2 + 1), rq2->sadb_x_ipsecrequest_len - sizeof(*rq2), &m->new_saddr, &m->new_daddr, &m->new_family); if (err) return err; if (rq1->sadb_x_ipsecrequest_proto != rq2->sadb_x_ipsecrequest_proto || rq1->sadb_x_ipsecrequest_mode != rq2->sadb_x_ipsecrequest_mode || rq1->sadb_x_ipsecrequest_reqid != rq2->sadb_x_ipsecrequest_reqid) return -EINVAL; m->proto = rq1->sadb_x_ipsecrequest_proto; if ((mode = pfkey_mode_to_xfrm(rq1->sadb_x_ipsecrequest_mode)) < 0) return -EINVAL; m->mode = mode; m->reqid = rq1->sadb_x_ipsecrequest_reqid; return ((int)(rq1->sadb_x_ipsecrequest_len + rq2->sadb_x_ipsecrequest_len)); } static int pfkey_migrate(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { int i, len, ret, err = -EINVAL; u8 dir; struct sadb_address *sa; struct sadb_x_kmaddress *kma; struct sadb_x_policy *pol; struct sadb_x_ipsecrequest *rq; struct xfrm_selector sel; struct xfrm_migrate m[XFRM_MAX_DEPTH]; struct xfrm_kmaddress k; struct net *net = sock_net(sk); if 
(!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC - 1], ext_hdrs[SADB_EXT_ADDRESS_DST - 1]) || !ext_hdrs[SADB_X_EXT_POLICY - 1]) { err = -EINVAL; goto out; } kma = ext_hdrs[SADB_X_EXT_KMADDRESS - 1]; pol = ext_hdrs[SADB_X_EXT_POLICY - 1]; if (pol->sadb_x_policy_dir >= IPSEC_DIR_MAX) { err = -EINVAL; goto out; } if (kma) { /* convert sadb_x_kmaddress to xfrm_kmaddress */ k.reserved = kma->sadb_x_kmaddress_reserved; ret = parse_sockaddr_pair((struct sockaddr *)(kma + 1), 8*(kma->sadb_x_kmaddress_len) - sizeof(*kma), &k.local, &k.remote, &k.family); if (ret < 0) { err = ret; goto out; } } dir = pol->sadb_x_policy_dir - 1; memset(&sel, 0, sizeof(sel)); /* set source address info of selector */ sa = ext_hdrs[SADB_EXT_ADDRESS_SRC - 1]; sel.family = pfkey_sadb_addr2xfrm_addr(sa, &sel.saddr); sel.prefixlen_s = sa->sadb_address_prefixlen; sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); sel.sport = ((struct sockaddr_in *)(sa + 1))->sin_port; if (sel.sport) sel.sport_mask = htons(0xffff); /* set destination address info of selector */ sa = ext_hdrs[SADB_EXT_ADDRESS_DST - 1]; pfkey_sadb_addr2xfrm_addr(sa, &sel.daddr); sel.prefixlen_d = sa->sadb_address_prefixlen; sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); sel.dport = ((struct sockaddr_in *)(sa + 1))->sin_port; if (sel.dport) sel.dport_mask = htons(0xffff); rq = (struct sadb_x_ipsecrequest *)(pol + 1); /* extract ipsecrequests */ i = 0; len = pol->sadb_x_policy_len * 8 - sizeof(struct sadb_x_policy); while (len > 0 && i < XFRM_MAX_DEPTH) { ret = ipsecrequests_to_migrate(rq, len, &m[i]); if (ret < 0) { err = ret; goto out; } else { rq = (struct sadb_x_ipsecrequest *)((u8 *)rq + ret); len -= ret; i++; } } if (!i || len > 0) { err = -EINVAL; goto out; } return xfrm_migrate(&sel, dir, XFRM_POLICY_TYPE_MAIN, m, i, kma ? &k : NULL, net, NULL, 0, NULL); out: return err; } #else static int pfkey_migrate(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { return -ENOPROTOOPT; } #endif static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); unsigned int dir; int err = 0, delete; struct sadb_x_policy *pol; struct xfrm_policy *xp; struct km_event c; if ((pol = ext_hdrs[SADB_X_EXT_POLICY-1]) == NULL) return -EINVAL; dir = xfrm_policy_id2dir(pol->sadb_x_policy_id); if (dir >= XFRM_POLICY_MAX) return -EINVAL; delete = (hdr->sadb_msg_type == SADB_X_SPDDELETE2); xp = xfrm_policy_byid(net, &dummy_mark, 0, XFRM_POLICY_TYPE_MAIN, dir, pol->sadb_x_policy_id, delete, &err); if (xp == NULL) return -ENOENT; if (delete) { xfrm_audit_policy_delete(xp, err ? 
0 : 1, true); if (err) goto out; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; c.data.byid = 1; c.event = XFRM_MSG_DELPOLICY; km_policy_notify(xp, dir, &c); } else { err = key_pol_get_resp(sk, xp, hdr, dir); } out: xfrm_pol_put(xp); return err; } static int dump_sp(struct xfrm_policy *xp, int dir, int count, void *ptr) { struct pfkey_sock *pfk = ptr; struct sk_buff *out_skb; struct sadb_msg *out_hdr; int err; if (!pfkey_can_dump(&pfk->sk)) return -ENOBUFS; out_skb = pfkey_xfrm_policy2msg_prep(xp); if (IS_ERR(out_skb)) return PTR_ERR(out_skb); err = pfkey_xfrm_policy2msg(out_skb, xp, dir); if (err < 0) { kfree_skb(out_skb); return err; } out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = pfk->dump.msg_version; out_hdr->sadb_msg_type = SADB_X_SPDDUMP; out_hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC; out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_seq = count + 1; out_hdr->sadb_msg_pid = pfk->dump.msg_portid; if (pfk->dump.skb) pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = out_skb; return 0; } static int pfkey_dump_sp(struct pfkey_sock *pfk) { struct net *net = sock_net(&pfk->sk); return xfrm_policy_walk(net, &pfk->dump.u.policy, dump_sp, (void *) pfk); } static void pfkey_dump_sp_done(struct pfkey_sock *pfk) { struct net *net = sock_net((struct sock *)pfk); xfrm_policy_walk_done(&pfk->dump.u.policy, net); } static int pfkey_spddump(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct pfkey_sock *pfk = pfkey_sk(sk); mutex_lock(&pfk->dump_lock); if (pfk->dump.dump != NULL) { mutex_unlock(&pfk->dump_lock); return -EBUSY; } pfk->dump.msg_version = hdr->sadb_msg_version; pfk->dump.msg_portid = hdr->sadb_msg_pid; pfk->dump.dump = pfkey_dump_sp; pfk->dump.done = pfkey_dump_sp_done; xfrm_policy_walk_init(&pfk->dump.u.policy, XFRM_POLICY_TYPE_MAIN); mutex_unlock(&pfk->dump_lock); return pfkey_do_dump(pfk); } static int key_notify_policy_flush(const struct km_event *c) { struct sk_buff *skb_out; struct sadb_msg *hdr; skb_out = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC); if (!skb_out) return -ENOBUFS; hdr = skb_put(skb_out, sizeof(struct sadb_msg)); hdr->sadb_msg_type = SADB_X_SPDFLUSH; hdr->sadb_msg_seq = c->seq; hdr->sadb_msg_pid = c->portid; hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_errno = (uint8_t) 0; hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); hdr->sadb_msg_reserved = 0; pfkey_broadcast(skb_out, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net); return 0; } static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); struct km_event c; int err, err2; err = xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, true); err2 = unicast_flush_resp(sk, hdr); if (err || err2) { if (err == -ESRCH) /* empty table - old silent behavior */ return 0; return err; } c.data.type = XFRM_POLICY_TYPE_MAIN; c.event = XFRM_MSG_FLUSHPOLICY; c.portid = hdr->sadb_msg_pid; c.seq = hdr->sadb_msg_seq; c.net = net; km_policy_notify(NULL, 0, &c); return 0; } typedef int (*pfkey_handler)(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs); static const pfkey_handler pfkey_funcs[SADB_MAX + 1] = { [SADB_RESERVED] = pfkey_reserved, [SADB_GETSPI] = pfkey_getspi, [SADB_UPDATE] = pfkey_add, [SADB_ADD] = pfkey_add, [SADB_DELETE] = pfkey_delete, [SADB_GET] = pfkey_get, [SADB_ACQUIRE] = pfkey_acquire, 
[SADB_REGISTER] = pfkey_register, [SADB_EXPIRE] = NULL, [SADB_FLUSH] = pfkey_flush, [SADB_DUMP] = pfkey_dump, [SADB_X_PROMISC] = pfkey_promisc, [SADB_X_PCHANGE] = NULL, [SADB_X_SPDUPDATE] = pfkey_spdadd, [SADB_X_SPDADD] = pfkey_spdadd, [SADB_X_SPDDELETE] = pfkey_spddelete, [SADB_X_SPDGET] = pfkey_spdget, [SADB_X_SPDACQUIRE] = NULL, [SADB_X_SPDDUMP] = pfkey_spddump, [SADB_X_SPDFLUSH] = pfkey_spdflush, [SADB_X_SPDSETIDX] = pfkey_spdadd, [SADB_X_SPDDELETE2] = pfkey_spdget, [SADB_X_MIGRATE] = pfkey_migrate, }; static int pfkey_process(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr) { void *ext_hdrs[SADB_EXT_MAX]; int err; /* Non-zero return value of pfkey_broadcast() does not always signal * an error and even on an actual error we may still want to process * the message so rather ignore the return value. */ pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL, BROADCAST_PROMISC_ONLY, NULL, sock_net(sk)); memset(ext_hdrs, 0, sizeof(ext_hdrs)); err = parse_exthdrs(skb, hdr, ext_hdrs); if (!err) { err = -EOPNOTSUPP; if (pfkey_funcs[hdr->sadb_msg_type]) err = pfkey_funcs[hdr->sadb_msg_type](sk, skb, hdr, ext_hdrs); } return err; } static struct sadb_msg *pfkey_get_base_msg(struct sk_buff *skb, int *errp) { struct sadb_msg *hdr = NULL; if (skb->len < sizeof(*hdr)) { *errp = -EMSGSIZE; } else { hdr = (struct sadb_msg *) skb->data; if (hdr->sadb_msg_version != PF_KEY_V2 || hdr->sadb_msg_reserved != 0 || (hdr->sadb_msg_type <= SADB_RESERVED || hdr->sadb_msg_type > SADB_MAX)) { hdr = NULL; *errp = -EINVAL; } else if (hdr->sadb_msg_len != (skb->len / sizeof(uint64_t)) || hdr->sadb_msg_len < (sizeof(struct sadb_msg) / sizeof(uint64_t))) { hdr = NULL; *errp = -EMSGSIZE; } else { *errp = 0; } } return hdr; } static inline int aalg_tmpl_set(const struct xfrm_tmpl *t, const struct xfrm_algo_desc *d) { unsigned int id = d->desc.sadb_alg_id; if (id >= sizeof(t->aalgos) * 8) return 0; return (t->aalgos >> id) & 1; } static inline int ealg_tmpl_set(const struct xfrm_tmpl *t, const struct xfrm_algo_desc *d) { unsigned int id = d->desc.sadb_alg_id; if (id >= sizeof(t->ealgos) * 8) return 0; return (t->ealgos >> id) & 1; } static int count_ah_combs(const struct xfrm_tmpl *t) { int i, sz = 0; for (i = 0; ; i++) { const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i); if (!aalg) break; if (!aalg->pfkey_supported) continue; if (aalg_tmpl_set(t, aalg)) sz += sizeof(struct sadb_comb); } return sz + sizeof(struct sadb_prop); } static int count_esp_combs(const struct xfrm_tmpl *t) { int i, k, sz = 0; for (i = 0; ; i++) { const struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i); if (!ealg) break; if (!ealg->pfkey_supported) continue; if (!(ealg_tmpl_set(t, ealg))) continue; for (k = 1; ; k++) { const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(k); if (!aalg) break; if (!aalg->pfkey_supported) continue; if (aalg_tmpl_set(t, aalg)) sz += sizeof(struct sadb_comb); } } return sz + sizeof(struct sadb_prop); } static int dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) { struct sadb_prop *p; int sz = 0; int i; p = skb_put(skb, sizeof(struct sadb_prop)); p->sadb_prop_len = sizeof(struct sadb_prop)/8; p->sadb_prop_exttype = SADB_EXT_PROPOSAL; p->sadb_prop_replay = 32; memset(p->sadb_prop_reserved, 0, sizeof(p->sadb_prop_reserved)); for (i = 0; ; i++) { const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i); if (!aalg) break; if (!aalg->pfkey_supported) continue; if (aalg_tmpl_set(t, aalg) && aalg->available) { struct sadb_comb *c; c = skb_put_zero(skb, sizeof(struct sadb_comb)); 
p->sadb_prop_len += sizeof(struct sadb_comb)/8; c->sadb_comb_auth = aalg->desc.sadb_alg_id; c->sadb_comb_auth_minbits = aalg->desc.sadb_alg_minbits; c->sadb_comb_auth_maxbits = aalg->desc.sadb_alg_maxbits; c->sadb_comb_hard_addtime = 24*60*60; c->sadb_comb_soft_addtime = 20*60*60; c->sadb_comb_hard_usetime = 8*60*60; c->sadb_comb_soft_usetime = 7*60*60; sz += sizeof(*c); } } return sz + sizeof(*p); } static int dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) { struct sadb_prop *p; int sz = 0; int i, k; p = skb_put(skb, sizeof(struct sadb_prop)); p->sadb_prop_len = sizeof(struct sadb_prop)/8; p->sadb_prop_exttype = SADB_EXT_PROPOSAL; p->sadb_prop_replay = 32; memset(p->sadb_prop_reserved, 0, sizeof(p->sadb_prop_reserved)); for (i=0; ; i++) { const struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i); if (!ealg) break; if (!ealg->pfkey_supported) continue; if (!(ealg_tmpl_set(t, ealg) && ealg->available)) continue; for (k = 1; ; k++) { struct sadb_comb *c; const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(k); if (!aalg) break; if (!aalg->pfkey_supported) continue; if (!(aalg_tmpl_set(t, aalg) && aalg->available)) continue; c = skb_put(skb, sizeof(struct sadb_comb)); memset(c, 0, sizeof(*c)); p->sadb_prop_len += sizeof(struct sadb_comb)/8; c->sadb_comb_auth = aalg->desc.sadb_alg_id; c->sadb_comb_auth_minbits = aalg->desc.sadb_alg_minbits; c->sadb_comb_auth_maxbits = aalg->desc.sadb_alg_maxbits; c->sadb_comb_encrypt = ealg->desc.sadb_alg_id; c->sadb_comb_encrypt_minbits = ealg->desc.sadb_alg_minbits; c->sadb_comb_encrypt_maxbits = ealg->desc.sadb_alg_maxbits; c->sadb_comb_hard_addtime = 24*60*60; c->sadb_comb_soft_addtime = 20*60*60; c->sadb_comb_hard_usetime = 8*60*60; c->sadb_comb_soft_usetime = 7*60*60; sz += sizeof(*c); } } return sz + sizeof(*p); } static int key_notify_policy_expire(struct xfrm_policy *xp, const struct km_event *c) { return 0; } static int key_notify_sa_expire(struct xfrm_state *x, const struct km_event *c) { struct sk_buff *out_skb; struct sadb_msg *out_hdr; int hard; int hsc; hard = c->data.hard; if (hard) hsc = 2; else hsc = 1; out_skb = pfkey_xfrm_state2msg_expire(x, hsc); if (IS_ERR(out_skb)) return PTR_ERR(out_skb); out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = PF_KEY_V2; out_hdr->sadb_msg_type = SADB_EXPIRE; out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_reserved = 0; out_hdr->sadb_msg_seq = 0; out_hdr->sadb_msg_pid = 0; pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x)); return 0; } static int pfkey_send_notify(struct xfrm_state *x, const struct km_event *c) { struct net *net = x ? 
xs_net(x) : c->net; struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); if (atomic_read(&net_pfkey->socks_nr) == 0) return 0; switch (c->event) { case XFRM_MSG_EXPIRE: return key_notify_sa_expire(x, c); case XFRM_MSG_DELSA: case XFRM_MSG_NEWSA: case XFRM_MSG_UPDSA: return key_notify_sa(x, c); case XFRM_MSG_FLUSHSA: return key_notify_sa_flush(c); case XFRM_MSG_NEWAE: /* not yet supported */ break; default: pr_err("pfkey: Unknown SA event %d\n", c->event); break; } return 0; } static int pfkey_send_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c) { if (xp && xp->type != XFRM_POLICY_TYPE_MAIN) return 0; switch (c->event) { case XFRM_MSG_POLEXPIRE: return key_notify_policy_expire(xp, c); case XFRM_MSG_DELPOLICY: case XFRM_MSG_NEWPOLICY: case XFRM_MSG_UPDPOLICY: return key_notify_policy(xp, dir, c); case XFRM_MSG_FLUSHPOLICY: if (c->data.type != XFRM_POLICY_TYPE_MAIN) break; return key_notify_policy_flush(c); default: pr_err("pfkey: Unknown policy event %d\n", c->event); break; } return 0; } static u32 get_acqseq(void) { u32 res; static atomic_t acqseq; do { res = atomic_inc_return(&acqseq); } while (!res); return res; } static bool pfkey_is_alive(const struct km_event *c) { struct netns_pfkey *net_pfkey = net_generic(c->net, pfkey_net_id); struct sock *sk; bool is_alive = false; rcu_read_lock(); sk_for_each_rcu(sk, &net_pfkey->table) { if (pfkey_sk(sk)->registered) { is_alive = true; break; } } rcu_read_unlock(); return is_alive; } static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *xp) { struct sk_buff *skb; struct sadb_msg *hdr; struct sadb_address *addr; struct sadb_x_policy *pol; int sockaddr_size; int size; struct sadb_x_sec_ctx *sec_ctx; struct xfrm_sec_ctx *xfrm_ctx; int ctx_size = 0; int alg_size = 0; sockaddr_size = pfkey_sockaddr_size(x->props.family); if (!sockaddr_size) return -EINVAL; size = sizeof(struct sadb_msg) + (sizeof(struct sadb_address) * 2) + (sockaddr_size * 2) + sizeof(struct sadb_x_policy); if (x->id.proto == IPPROTO_AH) alg_size = count_ah_combs(t); else if (x->id.proto == IPPROTO_ESP) alg_size = count_esp_combs(t); if ((xfrm_ctx = x->security)) { ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len); size += sizeof(struct sadb_x_sec_ctx) + ctx_size; } skb = alloc_skb(size + alg_size + 16, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; hdr = skb_put(skb, sizeof(struct sadb_msg)); hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_type = SADB_ACQUIRE; hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); hdr->sadb_msg_len = size / sizeof(uint64_t); hdr->sadb_msg_errno = 0; hdr->sadb_msg_reserved = 0; hdr->sadb_msg_seq = x->km.seq = get_acqseq(); hdr->sadb_msg_pid = 0; /* src address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(&x->props.saddr, 0, (struct sockaddr *) (addr + 1), x->props.family); if (!addr->sadb_address_prefixlen) BUG(); /* dst address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(&x->id.daddr, 0, (struct sockaddr *) (addr + 1), x->props.family); 
if (!addr->sadb_address_prefixlen) BUG(); pol = skb_put(skb, sizeof(struct sadb_x_policy)); pol->sadb_x_policy_len = sizeof(struct sadb_x_policy)/sizeof(uint64_t); pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY; pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC; pol->sadb_x_policy_dir = XFRM_POLICY_OUT + 1; pol->sadb_x_policy_reserved = 0; pol->sadb_x_policy_id = xp->index; pol->sadb_x_policy_priority = xp->priority; /* Set sadb_comb's. */ alg_size = 0; if (x->id.proto == IPPROTO_AH) alg_size = dump_ah_combs(skb, t); else if (x->id.proto == IPPROTO_ESP) alg_size = dump_esp_combs(skb, t); hdr->sadb_msg_len += alg_size / 8; /* security context */ if (xfrm_ctx) { sec_ctx = skb_put(skb, sizeof(struct sadb_x_sec_ctx) + ctx_size); sec_ctx->sadb_x_sec_len = (sizeof(struct sadb_x_sec_ctx) + ctx_size) / sizeof(uint64_t); sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX; sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi; sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg; sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len; memcpy(sec_ctx + 1, xfrm_ctx->ctx_str, xfrm_ctx->ctx_len); } return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x)); } static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt, u8 *data, int len, int *dir) { struct net *net = sock_net(sk); struct xfrm_policy *xp; struct sadb_x_policy *pol = (struct sadb_x_policy*)data; struct sadb_x_sec_ctx *sec_ctx; switch (sk->sk_family) { case AF_INET: if (opt != IP_IPSEC_POLICY) { *dir = -EOPNOTSUPP; return NULL; } break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: if (opt != IPV6_IPSEC_POLICY) { *dir = -EOPNOTSUPP; return NULL; } break; #endif default: *dir = -EINVAL; return NULL; } *dir = -EINVAL; if (len < sizeof(struct sadb_x_policy) || pol->sadb_x_policy_len*8 > len || pol->sadb_x_policy_type > IPSEC_POLICY_BYPASS || (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir > IPSEC_DIR_OUTBOUND)) return NULL; xp = xfrm_policy_alloc(net, GFP_ATOMIC); if (xp == NULL) { *dir = -ENOBUFS; return NULL; } xp->action = (pol->sadb_x_policy_type == IPSEC_POLICY_DISCARD ? XFRM_POLICY_BLOCK : XFRM_POLICY_ALLOW); xp->lft.soft_byte_limit = XFRM_INF; xp->lft.hard_byte_limit = XFRM_INF; xp->lft.soft_packet_limit = XFRM_INF; xp->lft.hard_packet_limit = XFRM_INF; xp->family = sk->sk_family; xp->xfrm_nr = 0; if (pol->sadb_x_policy_type == IPSEC_POLICY_IPSEC && (*dir = parse_ipsecrequests(xp, pol)) < 0) goto out; /* security context too */ if (len >= (pol->sadb_x_policy_len*8 + sizeof(struct sadb_x_sec_ctx))) { char *p = (char *)pol; struct xfrm_user_sec_ctx *uctx; p += pol->sadb_x_policy_len*8; sec_ctx = (struct sadb_x_sec_ctx *)p; if (len < pol->sadb_x_policy_len*8 + sec_ctx->sadb_x_sec_len*8) { *dir = -EINVAL; goto out; } if ((*dir = verify_sec_ctx_len(p))) goto out; uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_ATOMIC); *dir = security_xfrm_policy_alloc(&xp->security, uctx, GFP_ATOMIC); kfree(uctx); if (*dir) goto out; } *dir = pol->sadb_x_policy_dir-1; return xp; out: xp->walk.dead = 1; xfrm_policy_destroy(xp); return NULL; } static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport) { struct sk_buff *skb; struct sadb_msg *hdr; struct sadb_sa *sa; struct sadb_address *addr; struct sadb_x_nat_t_port *n_port; int sockaddr_size; int size; __u8 satype = (x->id.proto == IPPROTO_ESP ? 
SADB_SATYPE_ESP : 0); struct xfrm_encap_tmpl *natt = NULL; sockaddr_size = pfkey_sockaddr_size(x->props.family); if (!sockaddr_size) return -EINVAL; if (!satype) return -EINVAL; if (!x->encap) return -EINVAL; natt = x->encap; /* Build an SADB_X_NAT_T_NEW_MAPPING message: * * HDR | SA | ADDRESS_SRC (old addr) | NAT_T_SPORT (old port) | * ADDRESS_DST (new addr) | NAT_T_DPORT (new port) */ size = sizeof(struct sadb_msg) + sizeof(struct sadb_sa) + (sizeof(struct sadb_address) * 2) + (sockaddr_size * 2) + (sizeof(struct sadb_x_nat_t_port) * 2); skb = alloc_skb(size + 16, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; hdr = skb_put(skb, sizeof(struct sadb_msg)); hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_type = SADB_X_NAT_T_NEW_MAPPING; hdr->sadb_msg_satype = satype; hdr->sadb_msg_len = size / sizeof(uint64_t); hdr->sadb_msg_errno = 0; hdr->sadb_msg_reserved = 0; hdr->sadb_msg_seq = x->km.seq; hdr->sadb_msg_pid = 0; /* SA */ sa = skb_put(skb, sizeof(struct sadb_sa)); sa->sadb_sa_len = sizeof(struct sadb_sa)/sizeof(uint64_t); sa->sadb_sa_exttype = SADB_EXT_SA; sa->sadb_sa_spi = x->id.spi; sa->sadb_sa_replay = 0; sa->sadb_sa_state = 0; sa->sadb_sa_auth = 0; sa->sadb_sa_encrypt = 0; sa->sadb_sa_flags = 0; /* ADDRESS_SRC (old addr) */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(&x->props.saddr, 0, (struct sockaddr *) (addr + 1), x->props.family); if (!addr->sadb_address_prefixlen) BUG(); /* NAT_T_SPORT (old port) */ n_port = skb_put(skb, sizeof(*n_port)); n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_SPORT; n_port->sadb_x_nat_t_port_port = natt->encap_sport; n_port->sadb_x_nat_t_port_reserved = 0; /* ADDRESS_DST (new addr) */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(ipaddr, 0, (struct sockaddr *) (addr + 1), x->props.family); if (!addr->sadb_address_prefixlen) BUG(); /* NAT_T_DPORT (new port) */ n_port = skb_put(skb, sizeof(*n_port)); n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_DPORT; n_port->sadb_x_nat_t_port_port = sport; n_port->sadb_x_nat_t_port_reserved = 0; return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x)); } #ifdef CONFIG_NET_KEY_MIGRATE static int set_sadb_address(struct sk_buff *skb, int sasize, int type, const struct xfrm_selector *sel) { struct sadb_address *addr; addr = skb_put(skb, sizeof(struct sadb_address) + sasize); addr->sadb_address_len = (sizeof(struct sadb_address) + sasize)/8; addr->sadb_address_exttype = type; addr->sadb_address_proto = sel->proto; addr->sadb_address_reserved = 0; switch (type) { case SADB_EXT_ADDRESS_SRC: addr->sadb_address_prefixlen = sel->prefixlen_s; pfkey_sockaddr_fill(&sel->saddr, 0, (struct sockaddr *)(addr + 1), sel->family); break; case SADB_EXT_ADDRESS_DST: addr->sadb_address_prefixlen = sel->prefixlen_d; pfkey_sockaddr_fill(&sel->daddr, 0, (struct sockaddr *)(addr + 1), sel->family); break; default: return -EINVAL; } return 0; } 
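/*
 * MIGRATE helpers (CONFIG_NET_KEY_MIGRATE): set_sadb_kmaddress() below emits
 * the SADB_X_EXT_KMADDRESS extension carrying the key manager's local/remote
 * addresses, and set_ipsecrequest() emits one sadb_x_ipsecrequest followed by
 * a source/destination sockaddr pair; pfkey_send_migrate() strings these
 * together when broadcasting an SADB_X_MIGRATE message.
 */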
static int set_sadb_kmaddress(struct sk_buff *skb, const struct xfrm_kmaddress *k) { struct sadb_x_kmaddress *kma; u8 *sa; int family = k->family; int socklen = pfkey_sockaddr_len(family); int size_req; size_req = (sizeof(struct sadb_x_kmaddress) + pfkey_sockaddr_pair_size(family)); kma = skb_put_zero(skb, size_req); kma->sadb_x_kmaddress_len = size_req / 8; kma->sadb_x_kmaddress_exttype = SADB_X_EXT_KMADDRESS; kma->sadb_x_kmaddress_reserved = k->reserved; sa = (u8 *)(kma + 1); if (!pfkey_sockaddr_fill(&k->local, 0, (struct sockaddr *)sa, family) || !pfkey_sockaddr_fill(&k->remote, 0, (struct sockaddr *)(sa+socklen), family)) return -EINVAL; return 0; } static int set_ipsecrequest(struct sk_buff *skb, uint8_t proto, uint8_t mode, int level, uint32_t reqid, uint8_t family, const xfrm_address_t *src, const xfrm_address_t *dst) { struct sadb_x_ipsecrequest *rq; u8 *sa; int socklen = pfkey_sockaddr_len(family); int size_req; size_req = sizeof(struct sadb_x_ipsecrequest) + pfkey_sockaddr_pair_size(family); rq = skb_put_zero(skb, size_req); rq->sadb_x_ipsecrequest_len = size_req; rq->sadb_x_ipsecrequest_proto = proto; rq->sadb_x_ipsecrequest_mode = mode; rq->sadb_x_ipsecrequest_level = level; rq->sadb_x_ipsecrequest_reqid = reqid; sa = (u8 *) (rq + 1); if (!pfkey_sockaddr_fill(src, 0, (struct sockaddr *)sa, family) || !pfkey_sockaddr_fill(dst, 0, (struct sockaddr *)(sa + socklen), family)) return -EINVAL; return 0; } #endif #ifdef CONFIG_NET_KEY_MIGRATE static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap) { int i; int sasize_sel; int size = 0; int size_pol = 0; struct sk_buff *skb; struct sadb_msg *hdr; struct sadb_x_policy *pol; const struct xfrm_migrate *mp; if (type != XFRM_POLICY_TYPE_MAIN) return 0; if (num_bundles <= 0 || num_bundles > XFRM_MAX_DEPTH) return -EINVAL; if (k != NULL) { /* addresses for KM */ size += PFKEY_ALIGN8(sizeof(struct sadb_x_kmaddress) + pfkey_sockaddr_pair_size(k->family)); } /* selector */ sasize_sel = pfkey_sockaddr_size(sel->family); if (!sasize_sel) return -EINVAL; size += (sizeof(struct sadb_address) + sasize_sel) * 2; /* policy info */ size_pol += sizeof(struct sadb_x_policy); /* ipsecrequests */ for (i = 0, mp = m; i < num_bundles; i++, mp++) { /* old locator pair */ size_pol += sizeof(struct sadb_x_ipsecrequest) + pfkey_sockaddr_pair_size(mp->old_family); /* new locator pair */ size_pol += sizeof(struct sadb_x_ipsecrequest) + pfkey_sockaddr_pair_size(mp->new_family); } size += sizeof(struct sadb_msg) + size_pol; /* alloc buffer */ skb = alloc_skb(size, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; hdr = skb_put(skb, sizeof(struct sadb_msg)); hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_type = SADB_X_MIGRATE; hdr->sadb_msg_satype = pfkey_proto2satype(m->proto); hdr->sadb_msg_len = size / 8; hdr->sadb_msg_errno = 0; hdr->sadb_msg_reserved = 0; hdr->sadb_msg_seq = 0; hdr->sadb_msg_pid = 0; /* Addresses to be used by KM for negotiation, if ext is available */ if (k != NULL && (set_sadb_kmaddress(skb, k) < 0)) goto err; /* selector src */ set_sadb_address(skb, sasize_sel, SADB_EXT_ADDRESS_SRC, sel); /* selector dst */ set_sadb_address(skb, sasize_sel, SADB_EXT_ADDRESS_DST, sel); /* policy information */ pol = skb_put(skb, sizeof(struct sadb_x_policy)); pol->sadb_x_policy_len = size_pol / 8; pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY; pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC; pol->sadb_x_policy_dir = dir + 1; 
pol->sadb_x_policy_reserved = 0; pol->sadb_x_policy_id = 0; pol->sadb_x_policy_priority = 0; for (i = 0, mp = m; i < num_bundles; i++, mp++) { /* old ipsecrequest */ int mode = pfkey_mode_from_xfrm(mp->mode); if (mode < 0) goto err; if (set_ipsecrequest(skb, mp->proto, mode, (mp->reqid ? IPSEC_LEVEL_UNIQUE : IPSEC_LEVEL_REQUIRE), mp->reqid, mp->old_family, &mp->old_saddr, &mp->old_daddr) < 0) goto err; /* new ipsecrequest */ if (set_ipsecrequest(skb, mp->proto, mode, (mp->reqid ? IPSEC_LEVEL_UNIQUE : IPSEC_LEVEL_REQUIRE), mp->reqid, mp->new_family, &mp->new_saddr, &mp->new_daddr) < 0) goto err; } /* broadcast migrate message to sockets */ pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, &init_net); return 0; err: kfree_skb(skb); return -EINVAL; } #else static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap) { return -ENOPROTOOPT; } #endif static int pfkey_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct sk_buff *skb = NULL; struct sadb_msg *hdr = NULL; int err; struct net *net = sock_net(sk); err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) goto out; err = -EMSGSIZE; if ((unsigned int)len > sk->sk_sndbuf - 32) goto out; err = -ENOBUFS; skb = alloc_skb(len, GFP_KERNEL); if (skb == NULL) goto out; err = -EFAULT; if (memcpy_from_msg(skb_put(skb,len), msg, len)) goto out; hdr = pfkey_get_base_msg(skb, &err); if (!hdr) goto out; mutex_lock(&net->xfrm.xfrm_cfg_mutex); err = pfkey_process(sk, skb, hdr); mutex_unlock(&net->xfrm.xfrm_cfg_mutex); out: if (err && hdr && pfkey_error(hdr, err, sk) == 0) err = 0; kfree_skb(skb); return err ? : len; } static int pfkey_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct pfkey_sock *pfk = pfkey_sk(sk); struct sk_buff *skb; int copied, err; err = -EINVAL; if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT)) goto out; skb = skb_recv_datagram(sk, flags, &err); if (skb == NULL) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } skb_reset_transport_header(skb); err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free; sock_recv_cmsgs(msg, sk, skb); err = (flags & MSG_TRUNC) ? skb->len : copied; if (pfk->dump.dump != NULL && 3 * atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) pfkey_do_dump(pfk); out_free: skb_free_datagram(sk, skb); out: return err; } static const struct proto_ops pfkey_ops = { .family = PF_KEY, .owner = THIS_MODULE, /* Operations that make no sense on pfkey sockets. */ .bind = sock_no_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .mmap = sock_no_mmap, /* Now the operations that really occur. 
*/ .release = pfkey_release, .poll = datagram_poll, .sendmsg = pfkey_sendmsg, .recvmsg = pfkey_recvmsg, }; static const struct net_proto_family pfkey_family_ops = { .family = PF_KEY, .create = pfkey_create, .owner = THIS_MODULE, }; #ifdef CONFIG_PROC_FS static int pfkey_seq_show(struct seq_file *f, void *v) { struct sock *s = sk_entry(v); if (v == SEQ_START_TOKEN) seq_printf(f ,"sk RefCnt Rmem Wmem User Inode\n"); else seq_printf(f, "%pK %-6d %-6u %-6u %-6u %-6lu\n", s, refcount_read(&s->sk_refcnt), sk_rmem_alloc_get(s), sk_wmem_alloc_get(s), from_kuid_munged(seq_user_ns(f), sock_i_uid(s)), sock_i_ino(s) ); return 0; } static void *pfkey_seq_start(struct seq_file *f, loff_t *ppos) __acquires(rcu) { struct net *net = seq_file_net(f); struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); rcu_read_lock(); return seq_hlist_start_head_rcu(&net_pfkey->table, *ppos); } static void *pfkey_seq_next(struct seq_file *f, void *v, loff_t *ppos) { struct net *net = seq_file_net(f); struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); return seq_hlist_next_rcu(v, &net_pfkey->table, ppos); } static void pfkey_seq_stop(struct seq_file *f, void *v) __releases(rcu) { rcu_read_unlock(); } static const struct seq_operations pfkey_seq_ops = { .start = pfkey_seq_start, .next = pfkey_seq_next, .stop = pfkey_seq_stop, .show = pfkey_seq_show, }; static int __net_init pfkey_init_proc(struct net *net) { struct proc_dir_entry *e; e = proc_create_net("pfkey", 0, net->proc_net, &pfkey_seq_ops, sizeof(struct seq_net_private)); if (e == NULL) return -ENOMEM; return 0; } static void __net_exit pfkey_exit_proc(struct net *net) { remove_proc_entry("pfkey", net->proc_net); } #else static inline int pfkey_init_proc(struct net *net) { return 0; } static inline void pfkey_exit_proc(struct net *net) { } #endif static struct xfrm_mgr pfkeyv2_mgr = { .notify = pfkey_send_notify, .acquire = pfkey_send_acquire, .compile_policy = pfkey_compile_policy, .new_mapping = pfkey_send_new_mapping, .notify_policy = pfkey_send_policy_notify, .migrate = pfkey_send_migrate, .is_alive = pfkey_is_alive, }; static int __net_init pfkey_net_init(struct net *net) { struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); int rv; INIT_HLIST_HEAD(&net_pfkey->table); atomic_set(&net_pfkey->socks_nr, 0); rv = pfkey_init_proc(net); return rv; } static void __net_exit pfkey_net_exit(struct net *net) { struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); pfkey_exit_proc(net); WARN_ON(!hlist_empty(&net_pfkey->table)); } static struct pernet_operations pfkey_net_ops = { .init = pfkey_net_init, .exit = pfkey_net_exit, .id = &pfkey_net_id, .size = sizeof(struct netns_pfkey), }; static void __exit ipsec_pfkey_exit(void) { xfrm_unregister_km(&pfkeyv2_mgr); sock_unregister(PF_KEY); unregister_pernet_subsys(&pfkey_net_ops); proto_unregister(&key_proto); } static int __init ipsec_pfkey_init(void) { int err = proto_register(&key_proto, 0); if (err != 0) goto out; err = register_pernet_subsys(&pfkey_net_ops); if (err != 0) goto out_unregister_key_proto; err = sock_register(&pfkey_family_ops); if (err != 0) goto out_unregister_pernet; xfrm_register_km(&pfkeyv2_mgr); out: return err; out_unregister_pernet: unregister_pernet_subsys(&pfkey_net_ops); out_unregister_key_proto: proto_unregister(&key_proto); goto out; } module_init(ipsec_pfkey_init); module_exit(ipsec_pfkey_exit); MODULE_DESCRIPTION("PF_KEY socket helpers"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_KEY);
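/*
 * Minimal userspace sketch (illustrative only; assumes <linux/pfkeyv2.h> is
 * available and the caller has CAP_NET_ADMIN).  It shows the base-message
 * framing that pfkey_get_base_msg() above enforces: PF_KEY_V2 version, a
 * non-reserved message type, the reserved field zeroed, and sadb_msg_len
 * counted in 64-bit words and matching the payload size.
 */
#if 0	/* example only; not part of the kernel module */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/pfkeyv2.h>

int main(void)
{
	struct sadb_msg msg;
	int fd = socket(PF_KEY, SOCK_RAW, PF_KEY_V2);

	if (fd < 0) {
		perror("socket(PF_KEY)");
		return 1;
	}

	memset(&msg, 0, sizeof(msg));
	msg.sadb_msg_version = PF_KEY_V2;
	msg.sadb_msg_type    = SADB_FLUSH;	/* routed to pfkey_flush() */
	msg.sadb_msg_satype  = SADB_SATYPE_ESP;
	msg.sadb_msg_len     = sizeof(msg) / 8;	/* length in 64-bit words */
	msg.sadb_msg_seq     = 1;
	msg.sadb_msg_pid     = getpid();

	if (write(fd, &msg, sizeof(msg)) != (ssize_t)sizeof(msg))
		perror("write");

	close(fd);
	return 0;
}
#endif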
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2006, Johannes Berg <johannes@sipsolutions.net>
 */
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/leds.h>
#include "ieee80211_i.h"

#define MAC80211_BLINK_DELAY	50 /* ms */

static inline void ieee80211_led_rx(struct ieee80211_local *local)
{
#ifdef CONFIG_MAC80211_LEDS
	if (!atomic_read(&local->rx_led_active))
		return;
	led_trigger_blink_oneshot(&local->rx_led,
				  MAC80211_BLINK_DELAY, MAC80211_BLINK_DELAY, 0);
#endif
}

static inline void ieee80211_led_tx(struct ieee80211_local *local)
{
#ifdef CONFIG_MAC80211_LEDS
	if (!atomic_read(&local->tx_led_active))
		return;
	led_trigger_blink_oneshot(&local->tx_led,
				  MAC80211_BLINK_DELAY, MAC80211_BLINK_DELAY, 0);
#endif
}

#ifdef CONFIG_MAC80211_LEDS
void ieee80211_led_assoc(struct ieee80211_local *local, bool associated);
void ieee80211_led_radio(struct ieee80211_local *local, bool enabled);
void ieee80211_alloc_led_names(struct ieee80211_local *local);
void ieee80211_free_led_names(struct ieee80211_local *local);
void ieee80211_led_init(struct ieee80211_local *local);
void ieee80211_led_exit(struct ieee80211_local *local);
void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local,
				unsigned int types_on, unsigned int types_off);
#else
static inline void ieee80211_led_assoc(struct ieee80211_local *local,
				       bool associated)
{
}
static inline void ieee80211_led_radio(struct ieee80211_local *local,
				       bool enabled)
{
}
static inline void ieee80211_alloc_led_names(struct ieee80211_local *local)
{
}
static inline void ieee80211_free_led_names(struct ieee80211_local *local)
{
}
static inline void ieee80211_led_init(struct ieee80211_local *local)
{
}
static inline void ieee80211_led_exit(struct ieee80211_local *local)
{
}
static inline void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local,
					      unsigned int types_on,
					      unsigned int types_off)
{
}
#endif

static inline void
ieee80211_tpt_led_trig_tx(struct ieee80211_local *local, int bytes)
{
#ifdef CONFIG_MAC80211_LEDS
	if (atomic_read(&local->tpt_led_active))
		local->tpt_led_trigger->tx_bytes += bytes;
#endif
}

static inline void
ieee80211_tpt_led_trig_rx(struct ieee80211_local *local, int bytes)
{
#ifdef CONFIG_MAC80211_LEDS
	if (atomic_read(&local->tpt_led_active))
		local->tpt_led_trigger->rx_bytes += bytes;
#endif
}
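/*
 * Illustrative usage sketch (not part of the header): the mac80211 RX/TX hot
 * paths call these helpers once per frame, and the atomic_read() guards keep
 * the cost to a single load when no LED trigger is active.  Assumed call
 * sites, for orientation only:
 *
 *	ieee80211_led_rx(local);			// oneshot blink, 50 ms on/off
 *	ieee80211_tpt_led_trig_rx(local, skb->len);	// throughput accounting
 */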
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */

#ifndef _LINUX_SKMSG_H
#define _LINUX_SKMSG_H

#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/scatterlist.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <net/strparser.h>

#define MAX_MSG_FRAGS			MAX_SKB_FRAGS
#define NR_MSG_FRAG_IDS			(MAX_MSG_FRAGS + 1)

enum __sk_action {
	__SK_DROP = 0,
	__SK_PASS,
	__SK_REDIRECT,
	__SK_NONE,
};

struct sk_msg_sg {
	u32				start;
	u32				curr;
	u32				end;
	u32				size;
	u32				copybreak;
	DECLARE_BITMAP(copy, MAX_MSG_FRAGS + 2);
	/* The extra two elements:
	 * 1) used for chaining the front and sections when the list becomes
	 *    partitioned (e.g. end < start). The crypto APIs require the
	 *    chaining;
	 * 2) to chain tailer SG entries after the message.
	 */
	struct scatterlist		data[MAX_MSG_FRAGS + 2];
};

/* UAPI in filter.c depends on struct sk_msg_sg being first element.
*/ struct sk_msg { struct sk_msg_sg sg; void *data; void *data_end; u32 apply_bytes; u32 cork_bytes; u32 flags; struct sk_buff *skb; struct sock *sk_redir; struct sock *sk; struct list_head list; }; struct sk_psock_progs { struct bpf_prog *msg_parser; struct bpf_prog *stream_parser; struct bpf_prog *stream_verdict; struct bpf_prog *skb_verdict; struct bpf_link *msg_parser_link; struct bpf_link *stream_parser_link; struct bpf_link *stream_verdict_link; struct bpf_link *skb_verdict_link; }; enum sk_psock_state_bits { SK_PSOCK_TX_ENABLED, SK_PSOCK_RX_STRP_ENABLED, }; struct sk_psock_link { struct list_head list; struct bpf_map *map; void *link_raw; }; struct sk_psock_work_state { u32 len; u32 off; }; struct sk_psock { struct sock *sk; struct sock *sk_redir; u32 apply_bytes; u32 cork_bytes; u32 eval; bool redir_ingress; /* undefined if sk_redir is null */ struct sk_msg *cork; struct sk_psock_progs progs; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) struct strparser strp; #endif struct sk_buff_head ingress_skb; struct list_head ingress_msg; spinlock_t ingress_lock; unsigned long state; struct list_head link; spinlock_t link_lock; refcount_t refcnt; void (*saved_unhash)(struct sock *sk); void (*saved_destroy)(struct sock *sk); void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); void (*saved_data_ready)(struct sock *sk); /* psock_update_sk_prot may be called with restore=false many times * so the handler must be safe for this case. It will be called * exactly once with restore=true when the psock is being destroyed * and psock refcnt is zero, but before an RCU grace period. */ int (*psock_update_sk_prot)(struct sock *sk, struct sk_psock *psock, bool restore); struct proto *sk_proto; struct mutex work_mutex; struct sk_psock_work_state work_state; struct delayed_work work; struct sock *sk_pair; struct rcu_work rwork; }; int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, int elem_first_coalesce); int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src, u32 off, u32 len); void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len); int sk_msg_free(struct sock *sk, struct sk_msg *msg); int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg); void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes); void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg, u32 bytes); void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes); void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes); int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); bool sk_msg_is_readable(struct sock *sk); static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes) { WARN_ON(i == msg->sg.end && bytes); } static inline void sk_msg_apply_bytes(struct sk_psock *psock, u32 bytes) { if (psock->apply_bytes) { if (psock->apply_bytes < bytes) psock->apply_bytes = 0; else psock->apply_bytes -= bytes; } } static inline u32 sk_msg_iter_dist(u32 start, u32 end) { return end >= start ? 
end - start : end + (NR_MSG_FRAG_IDS - start); } #define sk_msg_iter_var_prev(var) \ do { \ if (var == 0) \ var = NR_MSG_FRAG_IDS - 1; \ else \ var--; \ } while (0) #define sk_msg_iter_var_next(var) \ do { \ var++; \ if (var == NR_MSG_FRAG_IDS) \ var = 0; \ } while (0) #define sk_msg_iter_prev(msg, which) \ sk_msg_iter_var_prev(msg->sg.which) #define sk_msg_iter_next(msg, which) \ sk_msg_iter_var_next(msg->sg.which) static inline void sk_msg_init(struct sk_msg *msg) { BUILD_BUG_ON(ARRAY_SIZE(msg->sg.data) - 1 != NR_MSG_FRAG_IDS); memset(msg, 0, sizeof(*msg)); sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS); } static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src, int which, u32 size) { dst->sg.data[which] = src->sg.data[which]; dst->sg.data[which].length = size; dst->sg.size += size; src->sg.size -= size; src->sg.data[which].length -= size; src->sg.data[which].offset += size; } static inline void sk_msg_xfer_full(struct sk_msg *dst, struct sk_msg *src) { memcpy(dst, src, sizeof(*src)); sk_msg_init(src); } static inline bool sk_msg_full(const struct sk_msg *msg) { return sk_msg_iter_dist(msg->sg.start, msg->sg.end) == MAX_MSG_FRAGS; } static inline u32 sk_msg_elem_used(const struct sk_msg *msg) { return sk_msg_iter_dist(msg->sg.start, msg->sg.end); } static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which) { return &msg->sg.data[which]; } static inline struct scatterlist sk_msg_elem_cpy(struct sk_msg *msg, int which) { return msg->sg.data[which]; } static inline struct page *sk_msg_page(struct sk_msg *msg, int which) { return sg_page(sk_msg_elem(msg, which)); } static inline bool sk_msg_to_ingress(const struct sk_msg *msg) { return msg->flags & BPF_F_INGRESS; } static inline void sk_msg_compute_data_pointers(struct sk_msg *msg) { struct scatterlist *sge = sk_msg_elem(msg, msg->sg.start); if (test_bit(msg->sg.start, msg->sg.copy)) { msg->data = NULL; msg->data_end = NULL; } else { msg->data = sg_virt(sge); msg->data_end = msg->data + sge->length; } } static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page, u32 len, u32 offset) { struct scatterlist *sge; get_page(page); sge = sk_msg_elem(msg, msg->sg.end); sg_set_page(sge, page, len, offset); sg_unmark_end(sge); __set_bit(msg->sg.end, msg->sg.copy); msg->sg.size += len; sk_msg_iter_next(msg, end); } static inline void sk_msg_sg_copy(struct sk_msg *msg, u32 i, bool copy_state) { do { if (copy_state) __set_bit(i, msg->sg.copy); else __clear_bit(i, msg->sg.copy); sk_msg_iter_var_next(i); if (i == msg->sg.end) break; } while (1); } static inline void sk_msg_sg_copy_set(struct sk_msg *msg, u32 start) { sk_msg_sg_copy(msg, start, true); } static inline void sk_msg_sg_copy_clear(struct sk_msg *msg, u32 start) { sk_msg_sg_copy(msg, start, false); } static inline struct sk_psock *sk_psock(const struct sock *sk) { return __rcu_dereference_sk_user_data_with_flags(sk, SK_USER_DATA_PSOCK); } static inline void sk_psock_set_state(struct sk_psock *psock, enum sk_psock_state_bits bit) { set_bit(bit, &psock->state); } static inline void sk_psock_clear_state(struct sk_psock *psock, enum sk_psock_state_bits bit) { clear_bit(bit, &psock->state); } static inline bool sk_psock_test_state(const struct sk_psock *psock, enum sk_psock_state_bits bit) { return test_bit(bit, &psock->state); } static inline void sock_drop(struct sock *sk, struct sk_buff *skb) { sk_drops_add(sk, skb); kfree_skb(skb); } static inline void sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { spin_lock_bh(&psock->ingress_lock); 
if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) list_add_tail(&msg->list, &psock->ingress_msg); else { sk_msg_free(psock->sk, msg); kfree(msg); } spin_unlock_bh(&psock->ingress_lock); } static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock) { struct sk_msg *msg; spin_lock_bh(&psock->ingress_lock); msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); if (msg) list_del(&msg->list); spin_unlock_bh(&psock->ingress_lock); return msg; } static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock) { struct sk_msg *msg; spin_lock_bh(&psock->ingress_lock); msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); spin_unlock_bh(&psock->ingress_lock); return msg; } static inline struct sk_msg *sk_psock_next_msg(struct sk_psock *psock, struct sk_msg *msg) { struct sk_msg *ret; spin_lock_bh(&psock->ingress_lock); if (list_is_last(&msg->list, &psock->ingress_msg)) ret = NULL; else ret = list_next_entry(msg, list); spin_unlock_bh(&psock->ingress_lock); return ret; } static inline bool sk_psock_queue_empty(const struct sk_psock *psock) { return psock ? list_empty(&psock->ingress_msg) : true; } static inline void kfree_sk_msg(struct sk_msg *msg) { if (msg->skb) consume_skb(msg->skb); kfree(msg); } static inline void sk_psock_report_error(struct sk_psock *psock, int err) { struct sock *sk = psock->sk; sk->sk_err = err; sk_error_report(sk); } struct sk_psock *sk_psock_init(struct sock *sk, int node); void sk_psock_stop(struct sk_psock *psock); #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock); #else static inline int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) { return -EOPNOTSUPP; } static inline void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) { } static inline void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) { } #endif void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock); int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg); /* * This specialized allocator has to be a macro for its allocations to be * accounted separately (to have a separate alloc_tag). The typecast is * intentional to enforce typesafety. 
*/ #define sk_psock_init_link() \ ((struct sk_psock_link *)kzalloc(sizeof(struct sk_psock_link), \ GFP_ATOMIC | __GFP_NOWARN)) static inline void sk_psock_free_link(struct sk_psock_link *link) { kfree(link); } struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock); static inline void sk_psock_cork_free(struct sk_psock *psock) { if (psock->cork) { sk_msg_free(psock->sk, psock->cork); kfree(psock->cork); psock->cork = NULL; } } static inline void sk_psock_restore_proto(struct sock *sk, struct sk_psock *psock) { if (psock->psock_update_sk_prot) psock->psock_update_sk_prot(sk, psock, true); } static inline struct sk_psock *sk_psock_get(struct sock *sk) { struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (psock && !refcount_inc_not_zero(&psock->refcnt)) psock = NULL; rcu_read_unlock(); return psock; } void sk_psock_drop(struct sock *sk, struct sk_psock *psock); static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) { if (refcount_dec_and_test(&psock->refcnt)) sk_psock_drop(sk, psock); } static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock) { read_lock_bh(&sk->sk_callback_lock); if (psock->saved_data_ready) psock->saved_data_ready(sk); else sk->sk_data_ready(sk); read_unlock_bh(&sk->sk_callback_lock); } static inline void psock_set_prog(struct bpf_prog **pprog, struct bpf_prog *prog) { prog = xchg(pprog, prog); if (prog) bpf_prog_put(prog); } static inline int psock_replace_prog(struct bpf_prog **pprog, struct bpf_prog *prog, struct bpf_prog *old) { if (cmpxchg(pprog, old, prog) != old) return -ENOENT; if (old) bpf_prog_put(old); return 0; } static inline void psock_progs_drop(struct sk_psock_progs *progs) { psock_set_prog(&progs->msg_parser, NULL); psock_set_prog(&progs->stream_parser, NULL); psock_set_prog(&progs->stream_verdict, NULL); psock_set_prog(&progs->skb_verdict, NULL); } int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb); static inline bool sk_psock_strp_enabled(struct sk_psock *psock) { if (!psock) return false; return !!psock->saved_data_ready; } #if IS_ENABLED(CONFIG_NET_SOCK_MSG) #define BPF_F_STRPARSER (1UL << 1) /* We only have two bits so far. */ #define BPF_F_PTR_MASK ~(BPF_F_INGRESS | BPF_F_STRPARSER) static inline bool skb_bpf_strparser(const struct sk_buff *skb) { unsigned long sk_redir = skb->_sk_redir; return sk_redir & BPF_F_STRPARSER; } static inline void skb_bpf_set_strparser(struct sk_buff *skb) { skb->_sk_redir |= BPF_F_STRPARSER; } static inline bool skb_bpf_ingress(const struct sk_buff *skb) { unsigned long sk_redir = skb->_sk_redir; return sk_redir & BPF_F_INGRESS; } static inline void skb_bpf_set_ingress(struct sk_buff *skb) { skb->_sk_redir |= BPF_F_INGRESS; } static inline void skb_bpf_set_redir(struct sk_buff *skb, struct sock *sk_redir, bool ingress) { skb->_sk_redir = (unsigned long)sk_redir; if (ingress) skb->_sk_redir |= BPF_F_INGRESS; } static inline struct sock *skb_bpf_redirect_fetch(const struct sk_buff *skb) { unsigned long sk_redir = skb->_sk_redir; return (struct sock *)(sk_redir & BPF_F_PTR_MASK); } static inline void skb_bpf_redirect_clear(struct sk_buff *skb) { skb->_sk_redir = 0; } #endif /* CONFIG_NET_SOCK_MSG */ #endif /* _LINUX_SKMSG_H */
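The scatterlist ring in struct sk_msg wraps at NR_MSG_FRAG_IDS, so sk_msg_iter_dist() and the prev/next macros above do modular arithmetic rather than plain subtraction. A small stand-alone sketch shows how the indices behave across the wrap point; MAX_MSG_FRAGS is assumed to be 17 (the usual MAX_SKB_FRAGS) purely for illustration.

/*
 * Stand-alone illustration of the sk_msg ring-index arithmetic.
 * MAX_MSG_FRAGS = 17 is an assumption; the kernel derives it from
 * MAX_SKB_FRAGS.
 */
#include <stdio.h>

#define MAX_MSG_FRAGS	17
#define NR_MSG_FRAG_IDS	(MAX_MSG_FRAGS + 1)

/* Same computation as sk_msg_iter_dist(): forward distance around the
 * ring of NR_MSG_FRAG_IDS slots.
 */
static unsigned int iter_dist(unsigned int start, unsigned int end)
{
	return end >= start ? end - start : end + (NR_MSG_FRAG_IDS - start);
}

static unsigned int iter_next(unsigned int var)
{
	var++;
	if (var == NR_MSG_FRAG_IDS)
		var = 0;
	return var;
}

int main(void)
{
	unsigned int start = 15, end = start, i;

	/* Queue five elements; the ring wraps after slot 17. */
	for (i = 0; i < 5; i++)
		end = iter_next(end);

	printf("start=%u end=%u used=%u full=%d\n",
	       start, end, iter_dist(start, end),
	       iter_dist(start, end) == MAX_MSG_FRAGS);
	/* Prints: start=15 end=2 used=5 full=0 */
	return 0;
}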
52 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) /* Copyright (C) 2019 Netronome Systems, Inc. */ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <net/snmp.h> #include <net/tls.h> #include "tls.h" #ifdef CONFIG_PROC_FS static const struct snmp_mib tls_mib_list[] = { SNMP_MIB_ITEM("TlsCurrTxSw", LINUX_MIB_TLSCURRTXSW), SNMP_MIB_ITEM("TlsCurrRxSw", LINUX_MIB_TLSCURRRXSW), SNMP_MIB_ITEM("TlsCurrTxDevice", LINUX_MIB_TLSCURRTXDEVICE), SNMP_MIB_ITEM("TlsCurrRxDevice", LINUX_MIB_TLSCURRRXDEVICE), SNMP_MIB_ITEM("TlsTxSw", LINUX_MIB_TLSTXSW), SNMP_MIB_ITEM("TlsRxSw", LINUX_MIB_TLSRXSW), SNMP_MIB_ITEM("TlsTxDevice", LINUX_MIB_TLSTXDEVICE), SNMP_MIB_ITEM("TlsRxDevice", LINUX_MIB_TLSRXDEVICE), SNMP_MIB_ITEM("TlsDecryptError", LINUX_MIB_TLSDECRYPTERROR), SNMP_MIB_ITEM("TlsRxDeviceResync", LINUX_MIB_TLSRXDEVICERESYNC), SNMP_MIB_ITEM("TlsDecryptRetry", LINUX_MIB_TLSDECRYPTRETRY), SNMP_MIB_ITEM("TlsRxNoPadViolation", LINUX_MIB_TLSRXNOPADVIOL), SNMP_MIB_SENTINEL }; static int tls_statistics_seq_show(struct seq_file *seq, void *v) { unsigned long buf[LINUX_MIB_TLSMAX] = {}; struct net *net = seq->private; int i; snmp_get_cpu_field_batch(buf, tls_mib_list, net->mib.tls_statistics); for (i = 0; tls_mib_list[i].name; i++) seq_printf(seq, "%-32s\t%lu\n", tls_mib_list[i].name, buf[i]); return 0; } #endif int __net_init tls_proc_init(struct net *net) { #ifdef CONFIG_PROC_FS if (!proc_create_net_single("tls_stat", 0444, net->proc_net, tls_statistics_seq_show, NULL)) return -ENOMEM; #endif /* CONFIG_PROC_FS */ return 0; } void __net_exit tls_proc_fini(struct net *net) { remove_proc_entry("tls_stat", net->proc_net); }
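tls_statistics_seq_show() emits one "%-32s\t%lu\n" line per MIB counter, so the resulting /proc/net/tls_stat file is trivial to parse. A small illustrative reader (user-space C; counter names come from tls_mib_list, values depend on the running system):

/*
 * Illustrative reader for /proc/net/tls_stat, matching the
 * name<whitespace>value format produced above.
 */
#include <stdio.h>

int main(void)
{
	char name[64];
	unsigned long val;
	FILE *f = fopen("/proc/net/tls_stat", "r");

	if (!f) {
		perror("fopen");	/* TLS module not loaded or no procfs */
		return 1;
	}
	while (fscanf(f, "%63s %lu", name, &val) == 2)
		printf("%s = %lu\n", name, val);

	fclose(f);
	return 0;
}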
// SPDX-License-Identifier: GPL-2.0
/*
 * High memory handling common code and variables.
 *
 * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
 *          Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de
 *
 *
 * Redesigned the x86 32-bit VM architecture to deal with
 * 64-bit physical space.
With current x86 CPUs this * means up to 64 Gigabytes physical RAM. * * Rewrote high memory support to move the page cache into * high memory. Implemented permanent (schedulable) kmaps * based on Linus' idea. * * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> */ #include <linux/mm.h> #include <linux/export.h> #include <linux/swap.h> #include <linux/bio.h> #include <linux/pagemap.h> #include <linux/mempool.h> #include <linux/init.h> #include <linux/hash.h> #include <linux/highmem.h> #include <linux/kgdb.h> #include <asm/tlbflush.h> #include <linux/vmalloc.h> #ifdef CONFIG_KMAP_LOCAL static inline int kmap_local_calc_idx(int idx) { return idx + KM_MAX_IDX * smp_processor_id(); } #ifndef arch_kmap_local_map_idx #define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx) #endif #endif /* CONFIG_KMAP_LOCAL */ /* * Virtual_count is not a pure "count". * 0 means that it is not mapped, and has not been mapped * since a TLB flush - it is usable. * 1 means that there are no users, but it has been mapped * since the last TLB flush - so we can't use it. * n means that there are (n-1) current users of it. */ #ifdef CONFIG_HIGHMEM /* * Architecture with aliasing data cache may define the following family of * helper functions in its asm/highmem.h to control cache color of virtual * addresses where physical memory pages are mapped by kmap. */ #ifndef get_pkmap_color /* * Determine color of virtual address where the page should be mapped. */ static inline unsigned int get_pkmap_color(struct page *page) { return 0; } #define get_pkmap_color get_pkmap_color /* * Get next index for mapping inside PKMAP region for page with given color. */ static inline unsigned int get_next_pkmap_nr(unsigned int color) { static unsigned int last_pkmap_nr; last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; return last_pkmap_nr; } /* * Determine if page index inside PKMAP region (pkmap_nr) of given color * has wrapped around PKMAP region end. When this happens an attempt to * flush all unused PKMAP slots is made. */ static inline int no_more_pkmaps(unsigned int pkmap_nr, unsigned int color) { return pkmap_nr == 0; } /* * Get the number of PKMAP entries of the given color. If no free slot is * found after checking that many entries, kmap will sleep waiting for * someone to call kunmap and free PKMAP slot. */ static inline int get_pkmap_entries_count(unsigned int color) { return LAST_PKMAP; } /* * Get head of a wait queue for PKMAP entries of the given color. * Wait queues for different mapping colors should be independent to avoid * unnecessary wakeups caused by freeing of slots of other colors. */ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) { static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); return &pkmap_map_wait; } #endif unsigned long __nr_free_highpages(void) { unsigned long pages = 0; struct zone *zone; for_each_populated_zone(zone) { if (is_highmem(zone)) pages += zone_page_state(zone, NR_FREE_PAGES); } return pages; } unsigned long __totalhigh_pages(void) { unsigned long pages = 0; struct zone *zone; for_each_populated_zone(zone) { if (is_highmem(zone)) pages += zone_managed_pages(zone); } return pages; } EXPORT_SYMBOL(__totalhigh_pages); static int pkmap_count[LAST_PKMAP]; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); pte_t *pkmap_page_table; /* * Most architectures have no use for kmap_high_get(), so let's abstract * the disabling of IRQ out of the locking in that case to save on a * potential useless overhead. 
*/ #ifdef ARCH_NEEDS_KMAP_HIGH_GET #define lock_kmap() spin_lock_irq(&kmap_lock) #define unlock_kmap() spin_unlock_irq(&kmap_lock) #define lock_kmap_any(flags) spin_lock_irqsave(&kmap_lock, flags) #define unlock_kmap_any(flags) spin_unlock_irqrestore(&kmap_lock, flags) #else #define lock_kmap() spin_lock(&kmap_lock) #define unlock_kmap() spin_unlock(&kmap_lock) #define lock_kmap_any(flags) \ do { spin_lock(&kmap_lock); (void)(flags); } while (0) #define unlock_kmap_any(flags) \ do { spin_unlock(&kmap_lock); (void)(flags); } while (0) #endif struct page *__kmap_to_page(void *vaddr) { unsigned long base = (unsigned long) vaddr & PAGE_MASK; struct kmap_ctrl *kctrl = &current->kmap_ctrl; unsigned long addr = (unsigned long)vaddr; int i; /* kmap() mappings */ if (WARN_ON_ONCE(addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP))) return pte_page(ptep_get(&pkmap_page_table[PKMAP_NR(addr)])); /* kmap_local_page() mappings */ if (WARN_ON_ONCE(base >= __fix_to_virt(FIX_KMAP_END) && base < __fix_to_virt(FIX_KMAP_BEGIN))) { for (i = 0; i < kctrl->idx; i++) { unsigned long base_addr; int idx; idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); base_addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); if (base_addr == base) return pte_page(kctrl->pteval[i]); } } return virt_to_page(vaddr); } EXPORT_SYMBOL(__kmap_to_page); static void flush_all_zero_pkmaps(void) { int i; int need_flush = 0; flush_cache_kmaps(); for (i = 0; i < LAST_PKMAP; i++) { struct page *page; pte_t ptent; /* * zero means we don't have anything to do, * >1 means that it is still in use. Only * a count of 1 means that it is free but * needs to be unmapped */ if (pkmap_count[i] != 1) continue; pkmap_count[i] = 0; /* sanity check */ ptent = ptep_get(&pkmap_page_table[i]); BUG_ON(pte_none(ptent)); /* * Don't need an atomic fetch-and-clear op here; * no-one has the page mapped, and cannot get at * its virtual address (and hence PTE) without first * getting the kmap_lock (which is held here). * So no dangers, even with speculative execution. 
*/ page = pte_page(ptent); pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]); set_page_address(page, NULL); need_flush = 1; } if (need_flush) flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); } void __kmap_flush_unused(void) { lock_kmap(); flush_all_zero_pkmaps(); unlock_kmap(); } static inline unsigned long map_new_virtual(struct page *page) { unsigned long vaddr; int count; unsigned int last_pkmap_nr; unsigned int color = get_pkmap_color(page); start: count = get_pkmap_entries_count(color); /* Find an empty entry */ for (;;) { last_pkmap_nr = get_next_pkmap_nr(color); if (no_more_pkmaps(last_pkmap_nr, color)) { flush_all_zero_pkmaps(); count = get_pkmap_entries_count(color); } if (!pkmap_count[last_pkmap_nr]) break; /* Found a usable entry */ if (--count) continue; /* * Sleep for somebody else to unmap their entries */ { DECLARE_WAITQUEUE(wait, current); wait_queue_head_t *pkmap_map_wait = get_pkmap_wait_queue_head(color); __set_current_state(TASK_UNINTERRUPTIBLE); add_wait_queue(pkmap_map_wait, &wait); unlock_kmap(); schedule(); remove_wait_queue(pkmap_map_wait, &wait); lock_kmap(); /* Somebody else might have mapped it while we slept */ if (page_address(page)) return (unsigned long)page_address(page); /* Re-start */ goto start; } } vaddr = PKMAP_ADDR(last_pkmap_nr); set_pte_at(&init_mm, vaddr, &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot)); pkmap_count[last_pkmap_nr] = 1; set_page_address(page, (void *)vaddr); return vaddr; } /** * kmap_high - map a highmem page into memory * @page: &struct page to map * * Returns the page's virtual memory address. * * We cannot call this from interrupts, as it may block. */ void *kmap_high(struct page *page) { unsigned long vaddr; /* * For highmem pages, we can't trust "virtual" until * after we have the lock. */ lock_kmap(); vaddr = (unsigned long)page_address(page); if (!vaddr) vaddr = map_new_virtual(page); pkmap_count[PKMAP_NR(vaddr)]++; BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2); unlock_kmap(); return (void *) vaddr; } EXPORT_SYMBOL(kmap_high); #ifdef ARCH_NEEDS_KMAP_HIGH_GET /** * kmap_high_get - pin a highmem page into memory * @page: &struct page to pin * * Returns the page's current virtual memory address, or NULL if no mapping * exists. If and only if a non null address is returned then a * matching call to kunmap_high() is necessary. * * This can be called from any context. */ void *kmap_high_get(struct page *page) { unsigned long vaddr, flags; lock_kmap_any(flags); vaddr = (unsigned long)page_address(page); if (vaddr) { BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 1); pkmap_count[PKMAP_NR(vaddr)]++; } unlock_kmap_any(flags); return (void *) vaddr; } #endif /** * kunmap_high - unmap a highmem page into memory * @page: &struct page to unmap * * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called * only from user context. */ void kunmap_high(struct page *page) { unsigned long vaddr; unsigned long nr; unsigned long flags; int need_wakeup; unsigned int color = get_pkmap_color(page); wait_queue_head_t *pkmap_map_wait; lock_kmap_any(flags); vaddr = (unsigned long)page_address(page); BUG_ON(!vaddr); nr = PKMAP_NR(vaddr); /* * A count must never go down to zero * without a TLB flush! */ need_wakeup = 0; switch (--pkmap_count[nr]) { case 0: BUG(); case 1: /* * Avoid an unnecessary wake_up() function call. * The common case is pkmap_count[] == 1, but * no waiters. * The tasks queued in the wait-queue are guarded * by both the lock in the wait-queue-head and by * the kmap_lock. 
As the kmap_lock is held here, * no need for the wait-queue-head's lock. Simply * test if the queue is empty. */ pkmap_map_wait = get_pkmap_wait_queue_head(color); need_wakeup = waitqueue_active(pkmap_map_wait); } unlock_kmap_any(flags); /* do wake-up, if needed, race-free outside of the spin lock */ if (need_wakeup) wake_up(pkmap_map_wait); } EXPORT_SYMBOL(kunmap_high); void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2) { unsigned int i; BUG_ON(end1 > page_size(page) || end2 > page_size(page)); if (start1 >= end1) start1 = end1 = 0; if (start2 >= end2) start2 = end2 = 0; for (i = 0; i < compound_nr(page); i++) { void *kaddr = NULL; if (start1 >= PAGE_SIZE) { start1 -= PAGE_SIZE; end1 -= PAGE_SIZE; } else { unsigned this_end = min_t(unsigned, end1, PAGE_SIZE); if (end1 > start1) { kaddr = kmap_local_page(page + i); memset(kaddr + start1, 0, this_end - start1); } end1 -= this_end; start1 = 0; } if (start2 >= PAGE_SIZE) { start2 -= PAGE_SIZE; end2 -= PAGE_SIZE; } else { unsigned this_end = min_t(unsigned, end2, PAGE_SIZE); if (end2 > start2) { if (!kaddr) kaddr = kmap_local_page(page + i); memset(kaddr + start2, 0, this_end - start2); } end2 -= this_end; start2 = 0; } if (kaddr) { kunmap_local(kaddr); flush_dcache_page(page + i); } if (!end1 && !end2) break; } BUG_ON((start1 | start2 | end1 | end2) != 0); } EXPORT_SYMBOL(zero_user_segments); #endif /* CONFIG_HIGHMEM */ #ifdef CONFIG_KMAP_LOCAL #include <asm/kmap_size.h> /* * With DEBUG_KMAP_LOCAL the stack depth is doubled and every second * slot is unused which acts as a guard page */ #ifdef CONFIG_DEBUG_KMAP_LOCAL # define KM_INCR 2 #else # define KM_INCR 1 #endif static inline int kmap_local_idx_push(void) { WARN_ON_ONCE(in_hardirq() && !irqs_disabled()); current->kmap_ctrl.idx += KM_INCR; BUG_ON(current->kmap_ctrl.idx >= KM_MAX_IDX); return current->kmap_ctrl.idx - 1; } static inline int kmap_local_idx(void) { return current->kmap_ctrl.idx - 1; } static inline void kmap_local_idx_pop(void) { current->kmap_ctrl.idx -= KM_INCR; BUG_ON(current->kmap_ctrl.idx < 0); } #ifndef arch_kmap_local_post_map # define arch_kmap_local_post_map(vaddr, pteval) do { } while (0) #endif #ifndef arch_kmap_local_pre_unmap # define arch_kmap_local_pre_unmap(vaddr) do { } while (0) #endif #ifndef arch_kmap_local_post_unmap # define arch_kmap_local_post_unmap(vaddr) do { } while (0) #endif #ifndef arch_kmap_local_unmap_idx #define arch_kmap_local_unmap_idx(idx, vaddr) kmap_local_calc_idx(idx) #endif #ifndef arch_kmap_local_high_get static inline void *arch_kmap_local_high_get(struct page *page) { return NULL; } #endif #ifndef arch_kmap_local_set_pte #define arch_kmap_local_set_pte(mm, vaddr, ptep, ptev) \ set_pte_at(mm, vaddr, ptep, ptev) #endif /* Unmap a local mapping which was obtained by kmap_high_get() */ static inline bool kmap_high_unmap_local(unsigned long vaddr) { #ifdef ARCH_NEEDS_KMAP_HIGH_GET if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { kunmap_high(pte_page(ptep_get(&pkmap_page_table[PKMAP_NR(vaddr)]))); return true; } #endif return false; } static pte_t *__kmap_pte; static pte_t *kmap_get_pte(unsigned long vaddr, int idx) { if (IS_ENABLED(CONFIG_KMAP_LOCAL_NON_LINEAR_PTE_ARRAY)) /* * Set by the arch if __kmap_pte[-idx] does not produce * the correct entry. 
*/ return virt_to_kpte(vaddr); if (!__kmap_pte) __kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); return &__kmap_pte[-idx]; } void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) { pte_t pteval, *kmap_pte; unsigned long vaddr; int idx; /* * Disable migration so resulting virtual address is stable * across preemption. */ migrate_disable(); preempt_disable(); idx = arch_kmap_local_map_idx(kmap_local_idx_push(), pfn); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); kmap_pte = kmap_get_pte(vaddr, idx); BUG_ON(!pte_none(ptep_get(kmap_pte))); pteval = pfn_pte(pfn, prot); arch_kmap_local_set_pte(&init_mm, vaddr, kmap_pte, pteval); arch_kmap_local_post_map(vaddr, pteval); current->kmap_ctrl.pteval[kmap_local_idx()] = pteval; preempt_enable(); return (void *)vaddr; } EXPORT_SYMBOL_GPL(__kmap_local_pfn_prot); void *__kmap_local_page_prot(struct page *page, pgprot_t prot) { void *kmap; /* * To broaden the usage of the actual kmap_local() machinery always map * pages when debugging is enabled and the architecture has no problems * with alias mappings. */ if (!IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP) && !PageHighMem(page)) return page_address(page); /* Try kmap_high_get() if architecture has it enabled */ kmap = arch_kmap_local_high_get(page); if (kmap) return kmap; return __kmap_local_pfn_prot(page_to_pfn(page), prot); } EXPORT_SYMBOL(__kmap_local_page_prot); void kunmap_local_indexed(const void *vaddr) { unsigned long addr = (unsigned long) vaddr & PAGE_MASK; pte_t *kmap_pte; int idx; if (addr < __fix_to_virt(FIX_KMAP_END) || addr > __fix_to_virt(FIX_KMAP_BEGIN)) { if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP)) { /* This _should_ never happen! See above. */ WARN_ON_ONCE(1); return; } /* * Handle mappings which were obtained by kmap_high_get() * first as the virtual address of such mappings is below * PAGE_OFFSET. Warn for all other addresses which are in * the user space part of the virtual address space. */ if (!kmap_high_unmap_local(addr)) WARN_ON_ONCE(addr < PAGE_OFFSET); return; } preempt_disable(); idx = arch_kmap_local_unmap_idx(kmap_local_idx(), addr); WARN_ON_ONCE(addr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); kmap_pte = kmap_get_pte(addr, idx); arch_kmap_local_pre_unmap(addr); pte_clear(&init_mm, addr, kmap_pte); arch_kmap_local_post_unmap(addr); current->kmap_ctrl.pteval[kmap_local_idx()] = __pte(0); kmap_local_idx_pop(); preempt_enable(); migrate_enable(); } EXPORT_SYMBOL(kunmap_local_indexed); /* * Invoked before switch_to(). This is safe even when during or after * clearing the maps an interrupt which needs a kmap_local happens because * the task::kmap_ctrl.idx is not modified by the unmapping code so a * nested kmap_local will use the next unused index and restore the index * on unmap. The already cleared kmaps of the outgoing task are irrelevant * because the interrupt context does not know about them. The same applies * when scheduling back in for an interrupt which happens before the * restore is complete. */ void __kmap_local_sched_out(void) { struct task_struct *tsk = current; pte_t *kmap_pte; int i; /* Clear kmaps */ for (i = 0; i < tsk->kmap_ctrl.idx; i++) { pte_t pteval = tsk->kmap_ctrl.pteval[i]; unsigned long addr; int idx; /* With debug all even slots are unmapped and act as guard */ if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL) && !(i & 0x01)) { WARN_ON_ONCE(pte_val(pteval) != 0); continue; } if (WARN_ON_ONCE(pte_none(pteval))) continue; /* * This is a horrible hack for XTENSA to calculate the * coloured PTE index. 
Uses the PFN encoded into the pteval * and the map index calculation because the actual mapped * virtual address is not stored in task::kmap_ctrl. * For any sane architecture this is optimized out. */ idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); kmap_pte = kmap_get_pte(addr, idx); arch_kmap_local_pre_unmap(addr); pte_clear(&init_mm, addr, kmap_pte); arch_kmap_local_post_unmap(addr); } } void __kmap_local_sched_in(void) { struct task_struct *tsk = current; pte_t *kmap_pte; int i; /* Restore kmaps */ for (i = 0; i < tsk->kmap_ctrl.idx; i++) { pte_t pteval = tsk->kmap_ctrl.pteval[i]; unsigned long addr; int idx; /* With debug all even slots are unmapped and act as guard */ if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL) && !(i & 0x01)) { WARN_ON_ONCE(pte_val(pteval) != 0); continue; } if (WARN_ON_ONCE(pte_none(pteval))) continue; /* See comment in __kmap_local_sched_out() */ idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); kmap_pte = kmap_get_pte(addr, idx); set_pte_at(&init_mm, addr, kmap_pte, pteval); arch_kmap_local_post_map(addr, pteval); } } void kmap_local_fork(struct task_struct *tsk) { if (WARN_ON_ONCE(tsk->kmap_ctrl.idx)) memset(&tsk->kmap_ctrl, 0, sizeof(tsk->kmap_ctrl)); } #endif #if defined(HASHED_PAGE_VIRTUAL) #define PA_HASH_ORDER 7 /* * Describes one page->virtual association */ struct page_address_map { struct page *page; void *virtual; struct list_head list; }; static struct page_address_map page_address_maps[LAST_PKMAP]; /* * Hash table bucket */ static struct page_address_slot { struct list_head lh; /* List of page_address_maps */ spinlock_t lock; /* Protect this bucket's list */ } ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER]; static struct page_address_slot *page_slot(const struct page *page) { return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)]; } /** * page_address - get the mapped virtual address of a page * @page: &struct page to get the virtual address of * * Returns the page's virtual address. 
*/ void *page_address(const struct page *page) { unsigned long flags; void *ret; struct page_address_slot *pas; if (!PageHighMem(page)) return lowmem_page_address(page); pas = page_slot(page); ret = NULL; spin_lock_irqsave(&pas->lock, flags); if (!list_empty(&pas->lh)) { struct page_address_map *pam; list_for_each_entry(pam, &pas->lh, list) { if (pam->page == page) { ret = pam->virtual; break; } } } spin_unlock_irqrestore(&pas->lock, flags); return ret; } EXPORT_SYMBOL(page_address); /** * set_page_address - set a page's virtual address * @page: &struct page to set * @virtual: virtual address to use */ void set_page_address(struct page *page, void *virtual) { unsigned long flags; struct page_address_slot *pas; struct page_address_map *pam; BUG_ON(!PageHighMem(page)); pas = page_slot(page); if (virtual) { /* Add */ pam = &page_address_maps[PKMAP_NR((unsigned long)virtual)]; pam->page = page; pam->virtual = virtual; spin_lock_irqsave(&pas->lock, flags); list_add_tail(&pam->list, &pas->lh); spin_unlock_irqrestore(&pas->lock, flags); } else { /* Remove */ spin_lock_irqsave(&pas->lock, flags); list_for_each_entry(pam, &pas->lh, list) { if (pam->page == page) { list_del(&pam->list); break; } } spin_unlock_irqrestore(&pas->lock, flags); } } void __init page_address_init(void) { int i; for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) { INIT_LIST_HEAD(&page_address_htable[i].lh); spin_lock_init(&page_address_htable[i].lock); } } #endif /* defined(HASHED_PAGE_VIRTUAL) */
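Much of the file above exists so that short-lived highmem mappings are cheap, nestable, and preserved across context switches. The canonical consumer pattern, as zero_user_segments() already demonstrates, is map, touch, unmap within the same thread. A hedged kernel-style sketch (the helper name and its caller are invented for illustration):

/*
 * Illustrative only: the usual kmap_local_page() consumer pattern.
 * copy_from_highmem_page() is an invented helper, not part of the file above.
 */
#include <linux/highmem.h>
#include <linux/string.h>

static void copy_from_highmem_page(struct page *page, void *dst,
				   unsigned int offset, unsigned int len)
{
	void *kaddr;

	/* The mapping is thread-local and CPU-bound (migration disabled);
	 * it must not be handed to another thread, but unlike kmap_atomic()
	 * the section may be preempted. Caller keeps offset+len within the
	 * page.
	 */
	kaddr = kmap_local_page(page);
	memcpy(dst, kaddr + offset, len);
	kunmap_local(kaddr);
}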
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		"Ping" sockets
 *
 *		Based on ipv4/udp.c code.
 *
 *		Authors:	Vasiliy Kulikov / Openwall (for Linux 2.6),
 *			Pavel Kankovsky (for Linux 2.4.32)
 *
 *		Pavel gave all rights to bugs to Vasiliy,
 *		none of the bugs are Pavel's now.
*/ #include <linux/uaccess.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/errno.h> #include <linux/timer.h> #include <linux/mm.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <net/snmp.h> #include <net/ip.h> #include <net/icmp.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/export.h> #include <linux/bpf-cgroup.h> #include <net/sock.h> #include <net/ping.h> #include <net/udp.h> #include <net/route.h> #include <net/inet_common.h> #include <net/checksum.h> #if IS_ENABLED(CONFIG_IPV6) #include <linux/in6.h> #include <linux/icmpv6.h> #include <net/addrconf.h> #include <net/ipv6.h> #include <net/transp_v6.h> #endif struct ping_table { struct hlist_head hash[PING_HTABLE_SIZE]; spinlock_t lock; }; static struct ping_table ping_table; struct pingv6_ops pingv6_ops; EXPORT_SYMBOL_GPL(pingv6_ops); static u16 ping_port_rover; static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask) { u32 res = (num + net_hash_mix(net)) & mask; pr_debug("hash(%u) = %u\n", num, res); return res; } EXPORT_SYMBOL_GPL(ping_hash); static inline struct hlist_head *ping_hashslot(struct ping_table *table, struct net *net, unsigned int num) { return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)]; } int ping_get_port(struct sock *sk, unsigned short ident) { struct inet_sock *isk, *isk2; struct hlist_head *hlist; struct sock *sk2 = NULL; isk = inet_sk(sk); spin_lock(&ping_table.lock); if (ident == 0) { u32 i; u16 result = ping_port_rover + 1; for (i = 0; i < (1L << 16); i++, result++) { if (!result) result++; /* avoid zero */ hlist = ping_hashslot(&ping_table, sock_net(sk), result); sk_for_each(sk2, hlist) { isk2 = inet_sk(sk2); if (isk2->inet_num == result) goto next_port; } /* found */ ping_port_rover = ident = result; break; next_port: ; } if (i >= (1L << 16)) goto fail; } else { hlist = ping_hashslot(&ping_table, sock_net(sk), ident); sk_for_each(sk2, hlist) { isk2 = inet_sk(sk2); /* BUG? Why is this reuse and not reuseaddr? ping.c * doesn't turn off SO_REUSEADDR, and it doesn't expect * that other ping processes can steal its packets. */ if ((isk2->inet_num == ident) && (sk2 != sk) && (!sk2->sk_reuse || !sk->sk_reuse)) goto fail; } } pr_debug("found port/ident = %d\n", ident); isk->inet_num = ident; if (sk_unhashed(sk)) { pr_debug("was not hashed\n"); sk_add_node_rcu(sk, hlist); sock_set_flag(sk, SOCK_RCU_FREE); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } spin_unlock(&ping_table.lock); return 0; fail: spin_unlock(&ping_table.lock); return -EADDRINUSE; } EXPORT_SYMBOL_GPL(ping_get_port); int ping_hash(struct sock *sk) { pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num); BUG(); /* "Please do not press this button again." 
*/ return 0; } void ping_unhash(struct sock *sk) { struct inet_sock *isk = inet_sk(sk); pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num); spin_lock(&ping_table.lock); if (sk_del_node_init_rcu(sk)) { isk->inet_num = 0; isk->inet_sport = 0; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); } spin_unlock(&ping_table.lock); } EXPORT_SYMBOL_GPL(ping_unhash); /* Called under rcu_read_lock() */ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) { struct hlist_head *hslot = ping_hashslot(&ping_table, net, ident); struct sock *sk = NULL; struct inet_sock *isk; int dif, sdif; if (skb->protocol == htons(ETH_P_IP)) { dif = inet_iif(skb); sdif = inet_sdif(skb); pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n", (int)ident, &ip_hdr(skb)->daddr, dif); #if IS_ENABLED(CONFIG_IPV6) } else if (skb->protocol == htons(ETH_P_IPV6)) { dif = inet6_iif(skb); sdif = inet6_sdif(skb); pr_debug("try to find: num = %d, daddr = %pI6c, dif = %d\n", (int)ident, &ipv6_hdr(skb)->daddr, dif); #endif } else { return NULL; } sk_for_each_rcu(sk, hslot) { isk = inet_sk(sk); pr_debug("iterate\n"); if (isk->inet_num != ident) continue; if (skb->protocol == htons(ETH_P_IP) && sk->sk_family == AF_INET) { pr_debug("found: %p: num=%d, daddr=%pI4, dif=%d\n", sk, (int) isk->inet_num, &isk->inet_rcv_saddr, sk->sk_bound_dev_if); if (isk->inet_rcv_saddr && isk->inet_rcv_saddr != ip_hdr(skb)->daddr) continue; #if IS_ENABLED(CONFIG_IPV6) } else if (skb->protocol == htons(ETH_P_IPV6) && sk->sk_family == AF_INET6) { pr_debug("found: %p: num=%d, daddr=%pI6c, dif=%d\n", sk, (int) isk->inet_num, &sk->sk_v6_rcv_saddr, sk->sk_bound_dev_if); if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) && !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, &ipv6_hdr(skb)->daddr)) continue; #endif } else { continue; } if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif && sk->sk_bound_dev_if != sdif) continue; goto exit; } sk = NULL; exit: return sk; } static void inet_get_ping_group_range_net(struct net *net, kgid_t *low, kgid_t *high) { kgid_t *data = net->ipv4.ping_group_range.range; unsigned int seq; do { seq = read_seqbegin(&net->ipv4.ping_group_range.lock); *low = data[0]; *high = data[1]; } while (read_seqretry(&net->ipv4.ping_group_range.lock, seq)); } int ping_init_sock(struct sock *sk) { struct net *net = sock_net(sk); kgid_t group = current_egid(); struct group_info *group_info; int i; kgid_t low, high; int ret = 0; if (sk->sk_family == AF_INET6) sk->sk_ipv6only = 1; inet_get_ping_group_range_net(net, &low, &high); if (gid_lte(low, group) && gid_lte(group, high)) return 0; group_info = get_current_groups(); for (i = 0; i < group_info->ngroups; i++) { kgid_t gid = group_info->gid[i]; if (gid_lte(low, gid) && gid_lte(gid, high)) goto out_release_group; } ret = -EACCES; out_release_group: put_group_info(group_info); return ret; } EXPORT_SYMBOL_GPL(ping_init_sock); void ping_close(struct sock *sk, long timeout) { pr_debug("ping_close(sk=%p,sk->num=%u)\n", inet_sk(sk), inet_sk(sk)->inet_num); pr_debug("isk->refcnt = %d\n", refcount_read(&sk->sk_refcnt)); sk_common_release(sk); } EXPORT_SYMBOL_GPL(ping_close); static int ping_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { /* This check is replicated from __ip4_datagram_connect() and * intended to prevent BPF program called below from accessing bytes * that are out of the bound specified by user in addr_len. 
*/ if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len); } /* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, struct sockaddr *uaddr, int addr_len) { struct net *net = sock_net(sk); if (sk->sk_family == AF_INET) { struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; u32 tb_id = RT_TABLE_LOCAL; int chk_addr_ret; if (addr_len < sizeof(*addr)) return -EINVAL; if (addr->sin_family != AF_INET && !(addr->sin_family == AF_UNSPEC && addr->sin_addr.s_addr == htonl(INADDR_ANY))) return -EAFNOSUPPORT; pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n", sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port)); if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) return 0; tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST || (chk_addr_ret != RTN_LOCAL && !inet_can_nonlocal_bind(net, isk))) return -EADDRNOTAVAIL; #if IS_ENABLED(CONFIG_IPV6) } else if (sk->sk_family == AF_INET6) { struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr; int addr_type, scoped, has_addr; struct net_device *dev = NULL; if (addr_len < sizeof(*addr)) return -EINVAL; if (addr->sin6_family != AF_INET6) return -EAFNOSUPPORT; pr_debug("ping_check_bind_addr(sk=%p,addr=%pI6c,port=%d)\n", sk, addr->sin6_addr.s6_addr, ntohs(addr->sin6_port)); addr_type = ipv6_addr_type(&addr->sin6_addr); scoped = __ipv6_addr_needs_scope_id(addr_type); if ((addr_type != IPV6_ADDR_ANY && !(addr_type & IPV6_ADDR_UNICAST)) || (scoped && !addr->sin6_scope_id)) return -EINVAL; rcu_read_lock(); if (addr->sin6_scope_id) { dev = dev_get_by_index_rcu(net, addr->sin6_scope_id); if (!dev) { rcu_read_unlock(); return -ENODEV; } } if (!dev && sk->sk_bound_dev_if) { dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); if (!dev) { rcu_read_unlock(); return -ENODEV; } } has_addr = pingv6_ops.ipv6_chk_addr(net, &addr->sin6_addr, dev, scoped); rcu_read_unlock(); if (!(ipv6_can_nonlocal_bind(net, isk) || has_addr || addr_type == IPV6_ADDR_ANY)) return -EADDRNOTAVAIL; if (scoped) sk->sk_bound_dev_if = addr->sin6_scope_id; #endif } else { return -EAFNOSUPPORT; } return 0; } static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr) { if (saddr->sa_family == AF_INET) { struct inet_sock *isk = inet_sk(sk); struct sockaddr_in *addr = (struct sockaddr_in *) saddr; isk->inet_rcv_saddr = isk->inet_saddr = addr->sin_addr.s_addr; #if IS_ENABLED(CONFIG_IPV6) } else if (saddr->sa_family == AF_INET6) { struct sockaddr_in6 *addr = (struct sockaddr_in6 *) saddr; struct ipv6_pinfo *np = inet6_sk(sk); sk->sk_v6_rcv_saddr = np->saddr = addr->sin6_addr; #endif } } /* * We need our own bind because there are no privileged id's == local ports. * Moreover, we don't allow binding to multi- and broadcast addresses. */ int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *isk = inet_sk(sk); unsigned short snum; int err; int dif = sk->sk_bound_dev_if; err = ping_check_bind_addr(sk, isk, uaddr, addr_len); if (err) return err; lock_sock(sk); err = -EINVAL; if (isk->inet_num != 0) goto out; err = -EADDRINUSE; snum = ntohs(((struct sockaddr_in *)uaddr)->sin_port); if (ping_get_port(sk, snum) != 0) { /* Restore possibly modified sk->sk_bound_dev_if by ping_check_bind_addr(). 
*/ sk->sk_bound_dev_if = dif; goto out; } ping_set_saddr(sk, uaddr); pr_debug("after bind(): num = %hu, dif = %d\n", isk->inet_num, sk->sk_bound_dev_if); err = 0; if (sk->sk_family == AF_INET && isk->inet_rcv_saddr) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6 && !ipv6_addr_any(&sk->sk_v6_rcv_saddr)) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; #endif if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; isk->inet_sport = htons(isk->inet_num); isk->inet_daddr = 0; isk->inet_dport = 0; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) memset(&sk->sk_v6_daddr, 0, sizeof(sk->sk_v6_daddr)); #endif sk_dst_reset(sk); out: release_sock(sk); pr_debug("ping_v4_bind -> %d\n", err); return err; } EXPORT_SYMBOL_GPL(ping_bind); /* * Is this a supported type of ICMP message? */ static inline int ping_supported(int family, int type, int code) { return (family == AF_INET && type == ICMP_ECHO && code == 0) || (family == AF_INET && type == ICMP_EXT_ECHO && code == 0) || (family == AF_INET6 && type == ICMPV6_ECHO_REQUEST && code == 0) || (family == AF_INET6 && type == ICMPV6_EXT_ECHO_REQUEST && code == 0); } /* * This routine is called by the ICMP module when it gets some * sort of error condition. */ void ping_err(struct sk_buff *skb, int offset, u32 info) { int family; struct icmphdr *icmph; struct inet_sock *inet_sock; int type; int code; struct net *net = dev_net(skb->dev); struct sock *sk; int harderr; int err; if (skb->protocol == htons(ETH_P_IP)) { family = AF_INET; type = icmp_hdr(skb)->type; code = icmp_hdr(skb)->code; icmph = (struct icmphdr *)(skb->data + offset); } else if (skb->protocol == htons(ETH_P_IPV6)) { family = AF_INET6; type = icmp6_hdr(skb)->icmp6_type; code = icmp6_hdr(skb)->icmp6_code; icmph = (struct icmphdr *) (skb->data + offset); } else { BUG(); } /* We assume the packet has already been checked by icmp_unreach */ if (!ping_supported(family, icmph->type, icmph->code)) return; pr_debug("ping_err(proto=0x%x,type=%d,code=%d,id=%04x,seq=%04x)\n", skb->protocol, type, code, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); if (!sk) { pr_debug("no socket, dropping\n"); return; /* No socket for error */ } pr_debug("err on socket %p\n", sk); err = 0; harderr = 0; inet_sock = inet_sk(sk); if (skb->protocol == htons(ETH_P_IP)) { switch (type) { default: case ICMP_TIME_EXCEEDED: err = EHOSTUNREACH; break; case ICMP_SOURCE_QUENCH: /* This is not a real error but ping wants to see it. * Report it with some fake errno. */ err = EREMOTEIO; break; case ICMP_PARAMETERPROB: err = EPROTO; harderr = 1; break; case ICMP_DEST_UNREACH: if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ ipv4_sk_update_pmtu(skb, sk, info); if (READ_ONCE(inet_sock->pmtudisc) != IP_PMTUDISC_DONT) { err = EMSGSIZE; harderr = 1; break; } goto out; } err = EHOSTUNREACH; if (code <= NR_ICMP_UNREACH) { harderr = icmp_err_convert[code].fatal; err = icmp_err_convert[code].errno; } break; case ICMP_REDIRECT: /* See ICMP_SOURCE_QUENCH */ ipv4_sk_redirect(skb, sk); err = EREMOTEIO; break; } #if IS_ENABLED(CONFIG_IPV6) } else if (skb->protocol == htons(ETH_P_IPV6)) { harderr = pingv6_ops.icmpv6_err_convert(type, code, &err); #endif } /* * RFC1122: OK. Passes ICMP errors back to application, as per * 4.1.3.3. 
*/ if ((family == AF_INET && !inet_test_bit(RECVERR, sk)) || (family == AF_INET6 && !inet6_test_bit(RECVERR6, sk))) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; } else { if (family == AF_INET) { ip_icmp_error(sk, skb, err, 0 /* no remote port */, info, (u8 *)icmph); #if IS_ENABLED(CONFIG_IPV6) } else if (family == AF_INET6) { pingv6_ops.ipv6_icmp_error(sk, skb, err, 0, info, (u8 *)icmph); #endif } } sk->sk_err = err; sk_error_report(sk); out: return; } EXPORT_SYMBOL_GPL(ping_err); /* * Copy and checksum an ICMP Echo packet from user space into a buffer * starting from the payload. */ int ping_getfrag(void *from, char *to, int offset, int fraglen, int odd, struct sk_buff *skb) { struct pingfakehdr *pfh = from; if (!csum_and_copy_from_iter_full(to, fraglen, &pfh->wcheck, &pfh->msg->msg_iter)) return -EFAULT; #if IS_ENABLED(CONFIG_IPV6) /* For IPv6, checksum each skb as we go along, as expected by * icmpv6_push_pending_frames. For IPv4, accumulate the checksum in * wcheck, it will be finalized in ping_v4_push_pending_frames. */ if (pfh->family == AF_INET6) { skb->csum = csum_block_add(skb->csum, pfh->wcheck, odd); skb->ip_summed = CHECKSUM_NONE; pfh->wcheck = 0; } #endif return 0; } EXPORT_SYMBOL_GPL(ping_getfrag); static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh, struct flowi4 *fl4) { struct sk_buff *skb = skb_peek(&sk->sk_write_queue); if (!skb) return 0; pfh->wcheck = csum_partial((char *)&pfh->icmph, sizeof(struct icmphdr), pfh->wcheck); pfh->icmph.checksum = csum_fold(pfh->wcheck); memcpy(icmp_hdr(skb), &pfh->icmph, sizeof(struct icmphdr)); skb->ip_summed = CHECKSUM_NONE; return ip_push_pending_frames(sk, fl4); } int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, void *user_icmph, size_t icmph_len) { u8 type, code; if (len > 0xFFFF) return -EMSGSIZE; /* Must have at least a full ICMP header. */ if (len < icmph_len) return -EINVAL; /* * Check the flags. */ /* Mirror BSD error message compatibility */ if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; /* * Fetch the ICMP header provided by the userland. * iovec is modified! The ICMP header is consumed. */ if (memcpy_from_msg(user_icmph, msg, icmph_len)) return -EFAULT; if (family == AF_INET) { type = ((struct icmphdr *) user_icmph)->type; code = ((struct icmphdr *) user_icmph)->code; #if IS_ENABLED(CONFIG_IPV6) } else if (family == AF_INET6) { type = ((struct icmp6hdr *) user_icmph)->icmp6_type; code = ((struct icmp6hdr *) user_icmph)->icmp6_code; #endif } else { BUG(); } if (!ping_supported(family, type, code)) return -EINVAL; return 0; } EXPORT_SYMBOL_GPL(ping_common_sendmsg); static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct net *net = sock_net(sk); struct flowi4 fl4; struct inet_sock *inet = inet_sk(sk); struct ipcm_cookie ipc; struct icmphdr user_icmph; struct pingfakehdr pfh; struct rtable *rt = NULL; struct ip_options_data opt_copy; int free = 0; __be32 saddr, daddr, faddr; u8 tos, scope; int err; pr_debug("ping_v4_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); err = ping_common_sendmsg(AF_INET, msg, len, &user_icmph, sizeof(user_icmph)); if (err) return err; /* * Get and verify the address. 
*/ if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) return -EAFNOSUPPORT; daddr = usin->sin_addr.s_addr; /* no remote port */ } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; daddr = inet->inet_daddr; /* no remote port */ } ipcm_init_sk(&ipc, inet); if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); if (unlikely(err)) { kfree(ipc.opt); return err; } if (ipc.opt) free = 1; } if (!ipc.opt) { struct ip_options_rcu *inet_opt; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { memcpy(&opt_copy, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); ipc.opt = &opt_copy.opt; } rcu_read_unlock(); } saddr = ipc.addr; ipc.addr = faddr = daddr; if (ipc.opt && ipc.opt->opt.srr) { if (!daddr) { err = -EINVAL; goto out_free; } faddr = ipc.opt->opt.faddr; } tos = get_rttos(&ipc, inet); scope = ip_sendmsg_scope(inet, &ipc, msg); if (ipv4_is_multicast(daddr)) { if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = READ_ONCE(inet->mc_index); if (!saddr) saddr = READ_ONCE(inet->mc_addr); } else if (!ipc.oif) ipc.oif = READ_ONCE(inet->uc_index); flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, scope, sk->sk_protocol, inet_sk_flowi_flags(sk), faddr, saddr, 0, 0, sk->sk_uid); fl4.fl4_icmp_type = user_icmph.type; fl4.fl4_icmp_code = user_icmph.code; security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; if (err == -ENETUNREACH) IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); goto out; } err = -EACCES; if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) goto out; if (msg->msg_flags & MSG_CONFIRM) goto do_confirm; back_from_confirm: if (!ipc.addr) ipc.addr = fl4.daddr; lock_sock(sk); pfh.icmph.type = user_icmph.type; /* already checked */ pfh.icmph.code = user_icmph.code; /* ditto */ pfh.icmph.checksum = 0; pfh.icmph.un.echo.id = inet->inet_sport; pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence; pfh.msg = msg; pfh.wcheck = 0; pfh.family = AF_INET; err = ip_append_data(sk, &fl4, ping_getfrag, &pfh, len, sizeof(struct icmphdr), &ipc, &rt, msg->msg_flags); if (err) ip_flush_pending_frames(sk); else err = ping_v4_push_pending_frames(sk, &pfh, &fl4); release_sock(sk); out: ip_rt_put(rt); out_free: if (free) kfree(ipc.opt); if (!err) { icmp_out_count(sock_net(sk), user_icmph.type); return len; } return err; do_confirm: if (msg->msg_flags & MSG_PROBE) dst_confirm_neigh(&rt->dst, &fl4.daddr); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; goto out; } int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct inet_sock *isk = inet_sk(sk); int family = sk->sk_family; struct sk_buff *skb; int copied, err; pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num); err = -EOPNOTSUPP; if (flags & MSG_OOB) goto out; if (flags & MSG_ERRQUEUE) return inet_recv_error(sk, msg, len, addr_len); skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } /* Don't bother checking the checksum */ err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; sock_recv_timestamp(msg, sk, skb); /* Copy the address and add cmsg data. 
*/ if (family == AF_INET) { DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); if (sin) { sin->sin_family = AF_INET; sin->sin_port = 0 /* skb->h.uh->source */; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); } if (inet_cmsg_flags(isk)) ip_cmsg_recv(msg, skb); #if IS_ENABLED(CONFIG_IPV6) } else if (family == AF_INET6) { struct ipv6hdr *ip6 = ipv6_hdr(skb); DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); if (sin6) { sin6->sin6_family = AF_INET6; sin6->sin6_port = 0; sin6->sin6_addr = ip6->saddr; sin6->sin6_flowinfo = 0; if (inet6_test_bit(SNDFLOW, sk)) sin6->sin6_flowinfo = ip6_flowinfo(ip6); sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, inet6_iif(skb)); *addr_len = sizeof(*sin6); } if (inet6_sk(sk)->rxopt.all) pingv6_ops.ip6_datagram_recv_common_ctl(sk, msg, skb); if (skb->protocol == htons(ETH_P_IPV6) && inet6_sk(sk)->rxopt.all) pingv6_ops.ip6_datagram_recv_specific_ctl(sk, msg, skb); else if (skb->protocol == htons(ETH_P_IP) && inet_cmsg_flags(isk)) ip_cmsg_recv(msg, skb); #endif } else { BUG(); } err = copied; done: skb_free_datagram(sk, skb); out: pr_debug("ping_recvmsg -> %d\n", err); return err; } EXPORT_SYMBOL_GPL(ping_recvmsg); static enum skb_drop_reason __ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason reason; pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n", inet_sk(sk), inet_sk(sk)->inet_num, skb); if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) { sk_skb_reason_drop(sk, skb, reason); pr_debug("ping_queue_rcv_skb -> failed\n"); return reason; } return SKB_NOT_DROPPED_YET; } int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { return __ping_queue_rcv_skb(sk, skb) ? -1 : 0; } EXPORT_SYMBOL_GPL(ping_queue_rcv_skb); /* * All we need to do is get the socket. 
*/ enum skb_drop_reason ping_rcv(struct sk_buff *skb) { enum skb_drop_reason reason = SKB_DROP_REASON_NO_SOCKET; struct sock *sk; struct net *net = dev_net(skb->dev); struct icmphdr *icmph = icmp_hdr(skb); /* We assume the packet has already been checked by icmp_rcv */ pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n", skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); /* Push ICMP header back */ skb_push(skb, skb->data - (u8 *)icmph); sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); if (sk) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); pr_debug("rcv on socket %p\n", sk); if (skb2) reason = __ping_queue_rcv_skb(sk, skb2); else reason = SKB_DROP_REASON_NOMEM; } if (reason) pr_debug("no socket, dropping\n"); return reason; } EXPORT_SYMBOL_GPL(ping_rcv); struct proto ping_prot = { .name = "PING", .owner = THIS_MODULE, .init = ping_init_sock, .close = ping_close, .pre_connect = ping_pre_connect, .connect = ip4_datagram_connect, .disconnect = __udp_disconnect, .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, .sendmsg = ping_v4_sendmsg, .recvmsg = ping_recvmsg, .bind = ping_bind, .backlog_rcv = ping_queue_rcv_skb, .release_cb = ip4_datagram_release_cb, .hash = ping_hash, .unhash = ping_unhash, .get_port = ping_get_port, .put_port = ping_unhash, .obj_size = sizeof(struct inet_sock), }; EXPORT_SYMBOL(ping_prot); #ifdef CONFIG_PROC_FS static struct sock *ping_get_first(struct seq_file *seq, int start) { struct sock *sk; struct ping_iter_state *state = seq->private; struct net *net = seq_file_net(seq); for (state->bucket = start; state->bucket < PING_HTABLE_SIZE; ++state->bucket) { struct hlist_head *hslot; hslot = &ping_table.hash[state->bucket]; if (hlist_empty(hslot)) continue; sk_for_each(sk, hslot) { if (net_eq(sock_net(sk), net) && sk->sk_family == state->family) goto found; } } sk = NULL; found: return sk; } static struct sock *ping_get_next(struct seq_file *seq, struct sock *sk) { struct ping_iter_state *state = seq->private; struct net *net = seq_file_net(seq); do { sk = sk_next(sk); } while (sk && (!net_eq(sock_net(sk), net))); if (!sk) return ping_get_first(seq, state->bucket + 1); return sk; } static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos) { struct sock *sk = ping_get_first(seq, 0); if (sk) while (pos && (sk = ping_get_next(seq, sk)) != NULL) --pos; return pos ? NULL : sk; } void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family) __acquires(ping_table.lock) { struct ping_iter_state *state = seq->private; state->bucket = 0; state->family = family; spin_lock(&ping_table.lock); return *pos ? 
ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN; } EXPORT_SYMBOL_GPL(ping_seq_start); static void *ping_v4_seq_start(struct seq_file *seq, loff_t *pos) { return ping_seq_start(seq, pos, AF_INET); } void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock *sk; if (v == SEQ_START_TOKEN) sk = ping_get_idx(seq, 0); else sk = ping_get_next(seq, v); ++*pos; return sk; } EXPORT_SYMBOL_GPL(ping_seq_next); void ping_seq_stop(struct seq_file *seq, void *v) __releases(ping_table.lock) { spin_unlock(&ping_table.lock); } EXPORT_SYMBOL_GPL(ping_seq_stop); static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, int bucket) { struct inet_sock *inet = inet_sk(sp); __be32 dest = inet->inet_daddr; __be32 src = inet->inet_rcv_saddr; __u16 destp = ntohs(inet->inet_dport); __u16 srcp = ntohs(inet->inet_sport); seq_printf(f, "%5d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), 0, 0L, 0, from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); } static int ping_v4_seq_show(struct seq_file *seq, void *v) { seq_setwidth(seq, 127); if (v == SEQ_START_TOKEN) seq_puts(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode ref pointer drops"); else { struct ping_iter_state *state = seq->private; ping_v4_format_sock(v, seq, state->bucket); } seq_pad(seq, '\n'); return 0; } static const struct seq_operations ping_v4_seq_ops = { .start = ping_v4_seq_start, .show = ping_v4_seq_show, .next = ping_seq_next, .stop = ping_seq_stop, }; static int __net_init ping_v4_proc_init_net(struct net *net) { if (!proc_create_net("icmp", 0444, net->proc_net, &ping_v4_seq_ops, sizeof(struct ping_iter_state))) return -ENOMEM; return 0; } static void __net_exit ping_v4_proc_exit_net(struct net *net) { remove_proc_entry("icmp", net->proc_net); } static struct pernet_operations ping_v4_net_ops = { .init = ping_v4_proc_init_net, .exit = ping_v4_proc_exit_net, }; int __init ping_proc_init(void) { return register_pernet_subsys(&ping_v4_net_ops); } void ping_proc_exit(void) { unregister_pernet_subsys(&ping_v4_net_ops); } #endif void __init ping_init(void) { int i; for (i = 0; i < PING_HTABLE_SIZE; i++) INIT_HLIST_HEAD(&ping_table.hash[i]); spin_lock_init(&ping_table.lock); }
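The code above is the kernel's IPv4 ping-socket implementation: it lets an unprivileged process send ICMP Echo requests through an ordinary datagram socket instead of a raw socket. What follows is a minimal userspace sketch of how that facility is consumed, not part of the kernel source itself. It assumes the caller's group id falls inside the net.ipv4.ping_group_range sysctl (otherwise socket() fails in ping_init_sock()) and uses 192.0.2.1 purely as an example destination.

/*
 * Userspace sketch: send one ICMP Echo request over an unprivileged
 * "ping socket" (SOCK_DGRAM + IPPROTO_ICMP) and print the reply.
 */
#include <stdio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/ip_icmp.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_port   = 0,	/* no remote port for ICMP; see ping_v4_sendmsg() */
	};
	unsigned char buf[192];
	struct icmphdr req = {
		.type = ICMP_ECHO,	/* only echo requests pass ping_supported() */
		.code = 0,
		/* checksum left 0: ping_v4_push_pending_frames() finalizes it */
		.un.echo.sequence = htons(1),
		/* un.echo.id can stay 0: the kernel rewrites it with the
		 * socket's local "port" allocated by ping_get_port() */
	};
	int fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP);

	if (fd < 0) {
		perror("socket");	/* EACCES if ping_group_range excludes us */
		return 1;
	}
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);

	if (sendto(fd, &req, sizeof(req), 0,
		   (struct sockaddr *)&dst, sizeof(dst)) < 0) {
		perror("sendto");
		return 1;
	}
	/* The reply starts with the ICMP header; ping_rcv() pushes it back
	 * onto the skb before queueing it to the socket. */
	if (recv(fd, buf, sizeof(buf), 0) > 0) {
		struct icmphdr *rep = (struct icmphdr *)buf;

		printf("echo reply id=%u seq=%u\n",
		       ntohs(rep->un.echo.id), ntohs(rep->un.echo.sequence));
	}
	close(fd);
	return 0;
}

Unlike a raw ICMP socket, no identifier or checksum bookkeeping is needed in userspace: ping_v4_sendmsg() stamps the socket's own id into the echo header and the kernel computes the checksum before the packet is pushed out.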
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. * Copyright (c) 1999-2019, Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. 
*/ #include <linux/completion.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/mutex.h> #include <linux/random.h> #include <linux/rbtree.h> #include <linux/igmp.h> #include <linux/xarray.h> #include <linux/inetdevice.h> #include <linux/slab.h> #include <linux/module.h> #include <net/route.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/netevent.h> #include <net/tcp.h> #include <net/ipv6.h> #include <net/ip_fib.h> #include <net/ip6_route.h> #include <rdma/rdma_cm.h> #include <rdma/rdma_cm_ib.h> #include <rdma/rdma_netlink.h> #include <rdma/ib.h> #include <rdma/ib_cache.h> #include <rdma/ib_cm.h> #include <rdma/ib_sa.h> #include <rdma/iw_cm.h> #include "core_priv.h" #include "cma_priv.h" #include "cma_trace.h" MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("Generic RDMA CM Agent"); MODULE_LICENSE("Dual BSD/GPL"); #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) #define CMA_IBOE_PACKET_LIFETIME 16 #define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP static const char * const cma_events[] = { [RDMA_CM_EVENT_ADDR_RESOLVED] = "address resolved", [RDMA_CM_EVENT_ADDR_ERROR] = "address error", [RDMA_CM_EVENT_ROUTE_RESOLVED] = "route resolved ", [RDMA_CM_EVENT_ROUTE_ERROR] = "route error", [RDMA_CM_EVENT_CONNECT_REQUEST] = "connect request", [RDMA_CM_EVENT_CONNECT_RESPONSE] = "connect response", [RDMA_CM_EVENT_CONNECT_ERROR] = "connect error", [RDMA_CM_EVENT_UNREACHABLE] = "unreachable", [RDMA_CM_EVENT_REJECTED] = "rejected", [RDMA_CM_EVENT_ESTABLISHED] = "established", [RDMA_CM_EVENT_DISCONNECTED] = "disconnected", [RDMA_CM_EVENT_DEVICE_REMOVAL] = "device removal", [RDMA_CM_EVENT_MULTICAST_JOIN] = "multicast join", [RDMA_CM_EVENT_MULTICAST_ERROR] = "multicast error", [RDMA_CM_EVENT_ADDR_CHANGE] = "address change", [RDMA_CM_EVENT_TIMEWAIT_EXIT] = "timewait exit", }; static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, enum ib_gid_type gid_type); const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event) { size_t index = event; return (index < ARRAY_SIZE(cma_events) && cma_events[index]) ? cma_events[index] : "unrecognized event"; } EXPORT_SYMBOL(rdma_event_msg); const char *__attribute_const__ rdma_reject_msg(struct rdma_cm_id *id, int reason) { if (rdma_ib_or_roce(id->device, id->port_num)) return ibcm_reject_msg(reason); if (rdma_protocol_iwarp(id->device, id->port_num)) return iwcm_reject_msg(reason); WARN_ON_ONCE(1); return "unrecognized transport"; } EXPORT_SYMBOL(rdma_reject_msg); /** * rdma_is_consumer_reject - return true if the consumer rejected the connect * request. * @id: Communication identifier that received the REJECT event. * @reason: Value returned in the REJECT event status field. */ static bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason) { if (rdma_ib_or_roce(id->device, id->port_num)) return reason == IB_CM_REJ_CONSUMER_DEFINED; if (rdma_protocol_iwarp(id->device, id->port_num)) return reason == -ECONNREFUSED; WARN_ON_ONCE(1); return false; } const void *rdma_consumer_reject_data(struct rdma_cm_id *id, struct rdma_cm_event *ev, u8 *data_len) { const void *p; if (rdma_is_consumer_reject(id, ev->status)) { *data_len = ev->param.conn.private_data_len; p = ev->param.conn.private_data; } else { *data_len = 0; p = NULL; } return p; } EXPORT_SYMBOL(rdma_consumer_reject_data); /** * rdma_iw_cm_id() - return the iw_cm_id pointer for this cm_id. 
* @id: Communication Identifier */ struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); if (id->device->node_type == RDMA_NODE_RNIC) return id_priv->cm_id.iw; return NULL; } EXPORT_SYMBOL(rdma_iw_cm_id); /** * rdma_res_to_id() - return the rdma_cm_id pointer for this restrack. * @res: rdma resource tracking entry pointer */ struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res) { struct rdma_id_private *id_priv = container_of(res, struct rdma_id_private, res); return &id_priv->id; } EXPORT_SYMBOL(rdma_res_to_id); static int cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device, void *client_data); static struct ib_client cma_client = { .name = "cma", .add = cma_add_one, .remove = cma_remove_one }; static struct ib_sa_client sa_client; static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); static struct rb_root id_table = RB_ROOT; /* Serialize operations of id_table tree */ static DEFINE_SPINLOCK(id_table_lock); static struct workqueue_struct *cma_wq; static unsigned int cma_pernet_id; struct cma_pernet { struct xarray tcp_ps; struct xarray udp_ps; struct xarray ipoib_ps; struct xarray ib_ps; }; static struct cma_pernet *cma_pernet(struct net *net) { return net_generic(net, cma_pernet_id); } static struct xarray *cma_pernet_xa(struct net *net, enum rdma_ucm_port_space ps) { struct cma_pernet *pernet = cma_pernet(net); switch (ps) { case RDMA_PS_TCP: return &pernet->tcp_ps; case RDMA_PS_UDP: return &pernet->udp_ps; case RDMA_PS_IPOIB: return &pernet->ipoib_ps; case RDMA_PS_IB: return &pernet->ib_ps; default: return NULL; } } struct id_table_entry { struct list_head id_list; struct rb_node rb_node; }; struct cma_device { struct list_head list; struct ib_device *device; struct completion comp; refcount_t refcount; struct list_head id_list; enum ib_gid_type *default_gid_type; u8 *default_roce_tos; }; struct rdma_bind_list { enum rdma_ucm_port_space ps; struct hlist_head owners; unsigned short port; }; static int cma_ps_alloc(struct net *net, enum rdma_ucm_port_space ps, struct rdma_bind_list *bind_list, int snum) { struct xarray *xa = cma_pernet_xa(net, ps); return xa_insert(xa, snum, bind_list, GFP_KERNEL); } static struct rdma_bind_list *cma_ps_find(struct net *net, enum rdma_ucm_port_space ps, int snum) { struct xarray *xa = cma_pernet_xa(net, ps); return xa_load(xa, snum); } static void cma_ps_remove(struct net *net, enum rdma_ucm_port_space ps, int snum) { struct xarray *xa = cma_pernet_xa(net, ps); xa_erase(xa, snum); } enum { CMA_OPTION_AFONLY, }; void cma_dev_get(struct cma_device *cma_dev) { refcount_inc(&cma_dev->refcount); } void cma_dev_put(struct cma_device *cma_dev) { if (refcount_dec_and_test(&cma_dev->refcount)) complete(&cma_dev->comp); } struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, void *cookie) { struct cma_device *cma_dev; struct cma_device *found_cma_dev = NULL; mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) if (filter(cma_dev->device, cookie)) { found_cma_dev = cma_dev; break; } if (found_cma_dev) cma_dev_get(found_cma_dev); mutex_unlock(&lock); return found_cma_dev; } int cma_get_default_gid_type(struct cma_device *cma_dev, u32 port) { if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; return cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)]; } int cma_set_default_gid_type(struct cma_device *cma_dev, u32 port, enum ib_gid_type 
default_gid_type) { unsigned long supported_gids; if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; if (default_gid_type == IB_GID_TYPE_IB && rdma_protocol_roce_eth_encap(cma_dev->device, port)) default_gid_type = IB_GID_TYPE_ROCE; supported_gids = roce_gid_type_mask_support(cma_dev->device, port); if (!(supported_gids & 1 << default_gid_type)) return -EINVAL; cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)] = default_gid_type; return 0; } int cma_get_default_roce_tos(struct cma_device *cma_dev, u32 port) { if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; return cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)]; } int cma_set_default_roce_tos(struct cma_device *cma_dev, u32 port, u8 default_roce_tos) { if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)] = default_roce_tos; return 0; } struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev) { return cma_dev->device; } /* * Device removal can occur at anytime, so we need extra handling to * serialize notifying the user of device removal with other callbacks. * We do this by disabling removal notification while a callback is in process, * and reporting it after the callback completes. */ struct cma_multicast { struct rdma_id_private *id_priv; union { struct ib_sa_multicast *sa_mc; struct { struct work_struct work; struct rdma_cm_event event; } iboe_join; }; struct list_head list; void *context; struct sockaddr_storage addr; u8 join_state; }; struct cma_work { struct work_struct work; struct rdma_id_private *id; enum rdma_cm_state old_state; enum rdma_cm_state new_state; struct rdma_cm_event event; }; union cma_ip_addr { struct in6_addr ip6; struct { __be32 pad[3]; __be32 addr; } ip4; }; struct cma_hdr { u8 cma_version; u8 ip_version; /* IP version: 7:4 */ __be16 port; union cma_ip_addr src_addr; union cma_ip_addr dst_addr; }; #define CMA_VERSION 0x00 struct cma_req_info { struct sockaddr_storage listen_addr_storage; struct sockaddr_storage src_addr_storage; struct ib_device *device; union ib_gid local_gid; __be64 service_id; int port; bool has_gid; u16 pkey; }; static int cma_comp_exch(struct rdma_id_private *id_priv, enum rdma_cm_state comp, enum rdma_cm_state exch) { unsigned long flags; int ret; /* * The FSM uses a funny double locking where state is protected by both * the handler_mutex and the spinlock. State is not allowed to change * to/from a handler_mutex protected value without also holding * handler_mutex. 
*/ if (comp == RDMA_CM_CONNECT || exch == RDMA_CM_CONNECT) lockdep_assert_held(&id_priv->handler_mutex); spin_lock_irqsave(&id_priv->lock, flags); if ((ret = (id_priv->state == comp))) id_priv->state = exch; spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr) { return hdr->ip_version >> 4; } static void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) { hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); } static struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv) { return (struct sockaddr *)&id_priv->id.route.addr.src_addr; } static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv) { return (struct sockaddr *)&id_priv->id.route.addr.dst_addr; } static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join) { struct in_device *in_dev = NULL; if (ndev) { rtnl_lock(); in_dev = __in_dev_get_rtnl(ndev); if (in_dev) { if (join) ip_mc_inc_group(in_dev, *(__be32 *)(mgid->raw + 12)); else ip_mc_dec_group(in_dev, *(__be32 *)(mgid->raw + 12)); } rtnl_unlock(); } return (in_dev) ? 0 : -ENODEV; } static int compare_netdev_and_ip(int ifindex_a, struct sockaddr *sa, struct id_table_entry *entry_b) { struct rdma_id_private *id_priv = list_first_entry( &entry_b->id_list, struct rdma_id_private, id_list_entry); int ifindex_b = id_priv->id.route.addr.dev_addr.bound_dev_if; struct sockaddr *sb = cma_dst_addr(id_priv); if (ifindex_a != ifindex_b) return (ifindex_a > ifindex_b) ? 1 : -1; if (sa->sa_family != sb->sa_family) return sa->sa_family - sb->sa_family; if (sa->sa_family == AF_INET && __builtin_object_size(sa, 0) >= sizeof(struct sockaddr_in)) { return memcmp(&((struct sockaddr_in *)sa)->sin_addr, &((struct sockaddr_in *)sb)->sin_addr, sizeof(((struct sockaddr_in *)sa)->sin_addr)); } if (sa->sa_family == AF_INET6 && __builtin_object_size(sa, 0) >= sizeof(struct sockaddr_in6)) { return ipv6_addr_cmp(&((struct sockaddr_in6 *)sa)->sin6_addr, &((struct sockaddr_in6 *)sb)->sin6_addr); } return -1; } static int cma_add_id_to_tree(struct rdma_id_private *node_id_priv) { struct rb_node **new, *parent = NULL; struct id_table_entry *this, *node; unsigned long flags; int result; node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) return -ENOMEM; spin_lock_irqsave(&id_table_lock, flags); new = &id_table.rb_node; while (*new) { this = container_of(*new, struct id_table_entry, rb_node); result = compare_netdev_and_ip( node_id_priv->id.route.addr.dev_addr.bound_dev_if, cma_dst_addr(node_id_priv), this); parent = *new; if (result < 0) new = &((*new)->rb_left); else if (result > 0) new = &((*new)->rb_right); else { list_add_tail(&node_id_priv->id_list_entry, &this->id_list); kfree(node); goto unlock; } } INIT_LIST_HEAD(&node->id_list); list_add_tail(&node_id_priv->id_list_entry, &node->id_list); rb_link_node(&node->rb_node, parent, new); rb_insert_color(&node->rb_node, &id_table); unlock: spin_unlock_irqrestore(&id_table_lock, flags); return 0; } static struct id_table_entry * node_from_ndev_ip(struct rb_root *root, int ifindex, struct sockaddr *sa) { struct rb_node *node = root->rb_node; struct id_table_entry *data; int result; while (node) { data = container_of(node, struct id_table_entry, rb_node); result = compare_netdev_and_ip(ifindex, sa, data); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return data; } return NULL; } static void cma_remove_id_from_tree(struct rdma_id_private *id_priv) { struct id_table_entry *data; unsigned long flags; 
spin_lock_irqsave(&id_table_lock, flags); if (list_empty(&id_priv->id_list_entry)) goto out; data = node_from_ndev_ip(&id_table, id_priv->id.route.addr.dev_addr.bound_dev_if, cma_dst_addr(id_priv)); if (!data) goto out; list_del_init(&id_priv->id_list_entry); if (list_empty(&data->id_list)) { rb_erase(&data->rb_node, &id_table); kfree(data); } out: spin_unlock_irqrestore(&id_table_lock, flags); } static void _cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { cma_dev_get(cma_dev); id_priv->cma_dev = cma_dev; id_priv->id.device = cma_dev->device; id_priv->id.route.addr.dev_addr.transport = rdma_node_get_transport(cma_dev->device->node_type); list_add_tail(&id_priv->device_item, &cma_dev->id_list); trace_cm_id_attach(id_priv, cma_dev->device); } static void cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { _cma_attach_to_dev(id_priv, cma_dev); id_priv->gid_type = cma_dev->default_gid_type[id_priv->id.port_num - rdma_start_port(cma_dev->device)]; } static void cma_release_dev(struct rdma_id_private *id_priv) { mutex_lock(&lock); list_del_init(&id_priv->device_item); cma_dev_put(id_priv->cma_dev); id_priv->cma_dev = NULL; id_priv->id.device = NULL; if (id_priv->id.route.addr.dev_addr.sgid_attr) { rdma_put_gid_attr(id_priv->id.route.addr.dev_addr.sgid_attr); id_priv->id.route.addr.dev_addr.sgid_attr = NULL; } mutex_unlock(&lock); } static inline unsigned short cma_family(struct rdma_id_private *id_priv) { return id_priv->id.route.addr.src_addr.ss_family; } static int cma_set_default_qkey(struct rdma_id_private *id_priv) { struct ib_sa_mcmember_rec rec; int ret = 0; switch (id_priv->id.ps) { case RDMA_PS_UDP: case RDMA_PS_IB: id_priv->qkey = RDMA_UDP_QKEY; break; case RDMA_PS_IPOIB: ib_addr_get_mgid(&id_priv->id.route.addr.dev_addr, &rec.mgid); ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, &rec.mgid, &rec); if (!ret) id_priv->qkey = be32_to_cpu(rec.qkey); break; default: break; } return ret; } static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) { if (!qkey || (id_priv->qkey && (id_priv->qkey != qkey))) return -EINVAL; id_priv->qkey = qkey; return 0; } static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr) { dev_addr->dev_type = ARPHRD_INFINIBAND; rdma_addr_set_sgid(dev_addr, (union ib_gid *) &sib->sib_addr); ib_addr_set_pkey(dev_addr, ntohs(sib->sib_pkey)); } static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) { int ret; if (addr->sa_family != AF_IB) { ret = rdma_translate_ip(addr, dev_addr); } else { cma_translate_ib((struct sockaddr_ib *) addr, dev_addr); ret = 0; } return ret; } static const struct ib_gid_attr * cma_validate_port(struct ib_device *device, u32 port, enum ib_gid_type gid_type, union ib_gid *gid, struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr = ERR_PTR(-ENODEV); int bound_if_index = dev_addr->bound_dev_if; int dev_type = dev_addr->dev_type; struct net_device *ndev = NULL; if (!rdma_dev_access_netns(device, id_priv->id.route.addr.dev_addr.net)) goto out; if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port)) goto out; if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port)) goto out; /* * For drivers that do not associate more than one net device with * their gid tables, such as iWARP drivers, it is sufficient to * return the first table entry. 
* * Other driver classes might be included in the future. */ if (rdma_protocol_iwarp(device, port)) { sgid_attr = rdma_get_gid_attr(device, port, 0); if (IS_ERR(sgid_attr)) goto out; rcu_read_lock(); ndev = rcu_dereference(sgid_attr->ndev); if (!net_eq(dev_net(ndev), dev_addr->net) || ndev->ifindex != bound_if_index) { rdma_put_gid_attr(sgid_attr); sgid_attr = ERR_PTR(-ENODEV); } rcu_read_unlock(); goto out; } if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { ndev = dev_get_by_index(dev_addr->net, bound_if_index); if (!ndev) goto out; } else { gid_type = IB_GID_TYPE_IB; } sgid_attr = rdma_find_gid_by_port(device, gid, gid_type, port, ndev); dev_put(ndev); out: return sgid_attr; } static void cma_bind_sgid_attr(struct rdma_id_private *id_priv, const struct ib_gid_attr *sgid_attr) { WARN_ON(id_priv->id.route.addr.dev_addr.sgid_attr); id_priv->id.route.addr.dev_addr.sgid_attr = sgid_attr; } /** * cma_acquire_dev_by_src_ip - Acquire cma device, port, gid attribute * based on source ip address. * @id_priv: cm_id which should be bound to cma device * * cma_acquire_dev_by_src_ip() binds cm id to cma device, port and GID attribute * based on source IP address. It returns 0 on success or error code otherwise. * It is applicable to active and passive side cm_id. */ static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr; union ib_gid gid, iboe_gid, *gidp; struct cma_device *cma_dev; enum ib_gid_type gid_type; int ret = -ENODEV; u32 port; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &iboe_gid); memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof(gid)); mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) { rdma_for_each_port (cma_dev->device, port) { gidp = rdma_protocol_roce(cma_dev->device, port) ? &iboe_gid : &gid; gid_type = cma_dev->default_gid_type[port - 1]; sgid_attr = cma_validate_port(cma_dev->device, port, gid_type, gidp, id_priv); if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; cma_bind_sgid_attr(id_priv, sgid_attr); cma_attach_to_dev(id_priv, cma_dev); ret = 0; goto out; } } } out: mutex_unlock(&lock); return ret; } /** * cma_ib_acquire_dev - Acquire cma device, port and SGID attribute * @id_priv: cm id to bind to cma device * @listen_id_priv: listener cm id to match against * @req: Pointer to req structure containing incoming * request information * cma_ib_acquire_dev() acquires cma device, port and SGID attribute when * rdma device matches for listen_id and incoming request. It also verifies * that a GID table entry is present for the source address. * Returns 0 on success, or returns error code otherwise.
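 *
 * Illustrative passive-side call flow (a sketch of how the request
 * handler below uses it, not a new interface):
 *
 *	conn_id = cma_ib_new_conn_id(&listen_id->id, ib_event, net_dev);
 *	ret = cma_ib_acquire_dev(conn_id, listen_id, &req);
 *	if (ret)
 *		destroy_id_handler_unlock(conn_id);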
*/ static int cma_ib_acquire_dev(struct rdma_id_private *id_priv, const struct rdma_id_private *listen_id_priv, struct cma_req_info *req) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr; enum ib_gid_type gid_type; union ib_gid gid; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; if (rdma_protocol_roce(req->device, req->port)) rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &gid); else memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof(gid)); gid_type = listen_id_priv->cma_dev->default_gid_type[req->port - 1]; sgid_attr = cma_validate_port(req->device, req->port, gid_type, &gid, id_priv); if (IS_ERR(sgid_attr)) return PTR_ERR(sgid_attr); id_priv->id.port_num = req->port; cma_bind_sgid_attr(id_priv, sgid_attr); /* Need to acquire lock to protect against reader * of cma_dev->id_list such as cma_netdev_callback() and * cma_process_remove(). */ mutex_lock(&lock); cma_attach_to_dev(id_priv, listen_id_priv->cma_dev); mutex_unlock(&lock); rdma_restrack_add(&id_priv->res); return 0; } static int cma_iw_acquire_dev(struct rdma_id_private *id_priv, const struct rdma_id_private *listen_id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr; struct cma_device *cma_dev; enum ib_gid_type gid_type; int ret = -ENODEV; union ib_gid gid; u32 port; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof(gid)); mutex_lock(&lock); cma_dev = listen_id_priv->cma_dev; port = listen_id_priv->id.port_num; gid_type = listen_id_priv->gid_type; sgid_attr = cma_validate_port(cma_dev->device, port, gid_type, &gid, id_priv); if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; cma_bind_sgid_attr(id_priv, sgid_attr); ret = 0; goto out; } list_for_each_entry(cma_dev, &dev_list, list) { rdma_for_each_port (cma_dev->device, port) { if (listen_id_priv->cma_dev == cma_dev && listen_id_priv->id.port_num == port) continue; gid_type = cma_dev->default_gid_type[port - 1]; sgid_attr = cma_validate_port(cma_dev->device, port, gid_type, &gid, id_priv); if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; cma_bind_sgid_attr(id_priv, sgid_attr); ret = 0; goto out; } } } out: if (!ret) { cma_attach_to_dev(id_priv, cma_dev); rdma_restrack_add(&id_priv->res); } mutex_unlock(&lock); return ret; } /* * Select the source IB device and address to reach the destination IB address. 
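 * A port is considered only if it supports AF_IB addressing and carries the
 * requested P_Key; within its GID table, an entry equal to the destination
 * GID, or failing that one sharing the destination's subnet prefix while the
 * port is ACTIVE, is used. The chosen SGID then becomes the AF_IB source
 * address of the id.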
*/ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) { struct cma_device *cma_dev, *cur_dev; struct sockaddr_ib *addr; union ib_gid gid, sgid, *dgid; unsigned int p; u16 pkey, index; enum ib_port_state port_state; int ret; int i; cma_dev = NULL; addr = (struct sockaddr_ib *) cma_dst_addr(id_priv); dgid = (union ib_gid *) &addr->sib_addr; pkey = ntohs(addr->sib_pkey); mutex_lock(&lock); list_for_each_entry(cur_dev, &dev_list, list) { rdma_for_each_port (cur_dev->device, p) { if (!rdma_cap_af_ib(cur_dev->device, p)) continue; if (ib_find_cached_pkey(cur_dev->device, p, pkey, &index)) continue; if (ib_get_cached_port_state(cur_dev->device, p, &port_state)) continue; for (i = 0; i < cur_dev->device->port_data[p].immutable.gid_tbl_len; ++i) { ret = rdma_query_gid(cur_dev->device, p, i, &gid); if (ret) continue; if (!memcmp(&gid, dgid, sizeof(gid))) { cma_dev = cur_dev; sgid = gid; id_priv->id.port_num = p; goto found; } if (!cma_dev && (gid.global.subnet_prefix == dgid->global.subnet_prefix) && port_state == IB_PORT_ACTIVE) { cma_dev = cur_dev; sgid = gid; id_priv->id.port_num = p; goto found; } } } } mutex_unlock(&lock); return -ENODEV; found: cma_attach_to_dev(id_priv, cma_dev); rdma_restrack_add(&id_priv->res); mutex_unlock(&lock); addr = (struct sockaddr_ib *)cma_src_addr(id_priv); memcpy(&addr->sib_addr, &sgid, sizeof(sgid)); cma_translate_ib(addr, &id_priv->id.route.addr.dev_addr); return 0; } static void cma_id_get(struct rdma_id_private *id_priv) { refcount_inc(&id_priv->refcount); } static void cma_id_put(struct rdma_id_private *id_priv) { if (refcount_dec_and_test(&id_priv->refcount)) complete(&id_priv->comp); } static struct rdma_id_private * __rdma_create_id(struct net *net, rdma_cm_event_handler event_handler, void *context, enum rdma_ucm_port_space ps, enum ib_qp_type qp_type, const struct rdma_id_private *parent) { struct rdma_id_private *id_priv; id_priv = kzalloc(sizeof *id_priv, GFP_KERNEL); if (!id_priv) return ERR_PTR(-ENOMEM); id_priv->state = RDMA_CM_IDLE; id_priv->id.context = context; id_priv->id.event_handler = event_handler; id_priv->id.ps = ps; id_priv->id.qp_type = qp_type; id_priv->tos_set = false; id_priv->timeout_set = false; id_priv->min_rnr_timer_set = false; id_priv->gid_type = IB_GID_TYPE_IB; spin_lock_init(&id_priv->lock); mutex_init(&id_priv->qp_mutex); init_completion(&id_priv->comp); refcount_set(&id_priv->refcount, 1); mutex_init(&id_priv->handler_mutex); INIT_LIST_HEAD(&id_priv->device_item); INIT_LIST_HEAD(&id_priv->id_list_entry); INIT_LIST_HEAD(&id_priv->listen_list); INIT_LIST_HEAD(&id_priv->mc_list); get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num); id_priv->id.route.addr.dev_addr.net = get_net(net); id_priv->seq_num &= 0x00ffffff; rdma_restrack_new(&id_priv->res, RDMA_RESTRACK_CM_ID); if (parent) rdma_restrack_parent_name(&id_priv->res, &parent->res); return id_priv; } struct rdma_cm_id * __rdma_create_kernel_id(struct net *net, rdma_cm_event_handler event_handler, void *context, enum rdma_ucm_port_space ps, enum ib_qp_type qp_type, const char *caller) { struct rdma_id_private *ret; ret = __rdma_create_id(net, event_handler, context, ps, qp_type, NULL); if (IS_ERR(ret)) return ERR_CAST(ret); rdma_restrack_set_name(&ret->res, caller); return &ret->id; } EXPORT_SYMBOL(__rdma_create_kernel_id); struct rdma_cm_id *rdma_create_user_id(rdma_cm_event_handler event_handler, void *context, enum rdma_ucm_port_space ps, enum ib_qp_type qp_type) { struct rdma_id_private *ret; ret = __rdma_create_id(current->nsproxy->net_ns, 
event_handler, context, ps, qp_type, NULL); if (IS_ERR(ret)) return ERR_CAST(ret); rdma_restrack_set_name(&ret->res, NULL); return &ret->id; } EXPORT_SYMBOL(rdma_create_user_id); static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); if (ret) return ret; qp_attr.qp_state = IB_QPS_RTR; ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE); if (ret) return ret; qp_attr.qp_state = IB_QPS_RTS; qp_attr.sq_psn = 0; ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN); return ret; } static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; return ib_modify_qp(qp, &qp_attr, qp_attr_mask); } int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr) { struct rdma_id_private *id_priv; struct ib_qp *qp; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (id->device != pd->device) { ret = -EINVAL; goto out_err; } qp_init_attr->port_num = id->port_num; qp = ib_create_qp(pd, qp_init_attr); if (IS_ERR(qp)) { ret = PTR_ERR(qp); goto out_err; } if (id->qp_type == IB_QPT_UD) ret = cma_init_ud_qp(id_priv, qp); else ret = cma_init_conn_qp(id_priv, qp); if (ret) goto out_destroy; id->qp = qp; id_priv->qp_num = qp->qp_num; id_priv->srq = (qp->srq != NULL); trace_cm_qp_create(id_priv, pd, qp_init_attr, 0); return 0; out_destroy: ib_destroy_qp(qp); out_err: trace_cm_qp_create(id_priv, pd, qp_init_attr, ret); return ret; } EXPORT_SYMBOL(rdma_create_qp); void rdma_destroy_qp(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); trace_cm_qp_destroy(id_priv); mutex_lock(&id_priv->qp_mutex); ib_destroy_qp(id_priv->id.qp); id_priv->id.qp = NULL; mutex_unlock(&id_priv->qp_mutex); } EXPORT_SYMBOL(rdma_destroy_qp); static int cma_modify_qp_rtr(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } /* Need to update QP attributes from default values. 
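 * The QP is moved through INIT and then RTR using the attribute masks
 * reported by rdma_init_qp_attr(); when a conn_param is supplied, the
 * peer's responder_resources caps max_dest_rd_atomic for the RTR step.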
*/ qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); if (ret) goto out; qp_attr.qp_state = IB_QPS_RTR; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; BUG_ON(id_priv->cma_dev->device != id_priv->id.device); if (conn_param) qp_attr.max_dest_rd_atomic = conn_param->responder_resources; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_modify_qp_rts(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } qp_attr.qp_state = IB_QPS_RTS; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; if (conn_param) qp_attr.max_rd_atomic = conn_param->initiator_depth; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_modify_qp_err(struct rdma_id_private *id_priv) { struct ib_qp_attr qp_attr; int ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } qp_attr.qp_state = IB_QPS_ERR; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, IB_QP_STATE); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int ret; u16 pkey; if (rdma_cap_eth_ah(id_priv->id.device, id_priv->id.port_num)) pkey = 0xffff; else pkey = ib_addr_get_pkey(dev_addr); ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num, pkey, &qp_attr->pkey_index); if (ret) return ret; qp_attr->port_num = id_priv->id.port_num; *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; if (id_priv->id.qp_type == IB_QPT_UD) { ret = cma_set_default_qkey(id_priv); if (ret) return ret; qp_attr->qkey = id_priv->qkey; *qp_attr_mask |= IB_QP_QKEY; } else { qp_attr->qp_access_flags = 0; *qp_attr_mask |= IB_QP_ACCESS_FLAGS; } return 0; } int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct rdma_id_private *id_priv; int ret = 0; id_priv = container_of(id, struct rdma_id_private, id); if (rdma_cap_ib_cm(id->device, id->port_num)) { if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD)) ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask); else ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr, qp_attr_mask); if (qp_attr->qp_state == IB_QPS_RTR) qp_attr->rq_psn = id_priv->seq_num; } else if (rdma_cap_iw_cm(id->device, id->port_num)) { if (!id_priv->cm_id.iw) { qp_attr->qp_access_flags = 0; *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; } else ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr, qp_attr_mask); qp_attr->port_num = id_priv->id.port_num; *qp_attr_mask |= IB_QP_PORT; } else { ret = -ENOSYS; } if ((*qp_attr_mask & IB_QP_TIMEOUT) && id_priv->timeout_set) qp_attr->timeout = id_priv->timeout; if ((*qp_attr_mask & IB_QP_MIN_RNR_TIMER) && id_priv->min_rnr_timer_set) qp_attr->min_rnr_timer = id_priv->min_rnr_timer; return ret; } EXPORT_SYMBOL(rdma_init_qp_attr); static inline bool cma_zero_addr(const struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: return ipv4_is_zeronet(((struct sockaddr_in *)addr)->sin_addr.s_addr); case AF_INET6: return ipv6_addr_any(&((struct sockaddr_in6 *)addr)->sin6_addr); 
case AF_IB: return ib_addr_any(&((struct sockaddr_ib *)addr)->sib_addr); default: return false; } } static inline bool cma_loopback_addr(const struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: return ipv4_is_loopback( ((struct sockaddr_in *)addr)->sin_addr.s_addr); case AF_INET6: return ipv6_addr_loopback( &((struct sockaddr_in6 *)addr)->sin6_addr); case AF_IB: return ib_addr_loopback( &((struct sockaddr_ib *)addr)->sib_addr); default: return false; } } static inline bool cma_any_addr(const struct sockaddr *addr) { return cma_zero_addr(addr) || cma_loopback_addr(addr); } static int cma_addr_cmp(const struct sockaddr *src, const struct sockaddr *dst) { if (src->sa_family != dst->sa_family) return -1; switch (src->sa_family) { case AF_INET: return ((struct sockaddr_in *)src)->sin_addr.s_addr != ((struct sockaddr_in *)dst)->sin_addr.s_addr; case AF_INET6: { struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *)src; struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *)dst; bool link_local; if (ipv6_addr_cmp(&src_addr6->sin6_addr, &dst_addr6->sin6_addr)) return 1; link_local = ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL; /* Link local must match their scope_ids */ return link_local ? (src_addr6->sin6_scope_id != dst_addr6->sin6_scope_id) : 0; } default: return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr, &((struct sockaddr_ib *) dst)->sib_addr); } } static __be16 cma_port(const struct sockaddr *addr) { struct sockaddr_ib *sib; switch (addr->sa_family) { case AF_INET: return ((struct sockaddr_in *) addr)->sin_port; case AF_INET6: return ((struct sockaddr_in6 *) addr)->sin6_port; case AF_IB: sib = (struct sockaddr_ib *) addr; return htons((u16) (be64_to_cpu(sib->sib_sid) & be64_to_cpu(sib->sib_sid_mask))); default: return 0; } } static inline int cma_any_port(const struct sockaddr *addr) { return !cma_port(addr); } static void cma_save_ib_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, const struct rdma_cm_id *listen_id, const struct sa_path_rec *path) { struct sockaddr_ib *listen_ib, *ib; listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr; if (src_addr) { ib = (struct sockaddr_ib *)src_addr; ib->sib_family = AF_IB; if (path) { ib->sib_pkey = path->pkey; ib->sib_flowinfo = path->flow_label; memcpy(&ib->sib_addr, &path->sgid, 16); ib->sib_sid = path->service_id; ib->sib_scope_id = 0; } else { ib->sib_pkey = listen_ib->sib_pkey; ib->sib_flowinfo = listen_ib->sib_flowinfo; ib->sib_addr = listen_ib->sib_addr; ib->sib_sid = listen_ib->sib_sid; ib->sib_scope_id = listen_ib->sib_scope_id; } ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL); } if (dst_addr) { ib = (struct sockaddr_ib *)dst_addr; ib->sib_family = AF_IB; if (path) { ib->sib_pkey = path->pkey; ib->sib_flowinfo = path->flow_label; memcpy(&ib->sib_addr, &path->dgid, 16); } } } static void cma_save_ip4_info(struct sockaddr_in *src_addr, struct sockaddr_in *dst_addr, struct cma_hdr *hdr, __be16 local_port) { if (src_addr) { *src_addr = (struct sockaddr_in) { .sin_family = AF_INET, .sin_addr.s_addr = hdr->dst_addr.ip4.addr, .sin_port = local_port, }; } if (dst_addr) { *dst_addr = (struct sockaddr_in) { .sin_family = AF_INET, .sin_addr.s_addr = hdr->src_addr.ip4.addr, .sin_port = hdr->port, }; } } static void cma_save_ip6_info(struct sockaddr_in6 *src_addr, struct sockaddr_in6 *dst_addr, struct cma_hdr *hdr, __be16 local_port) { if (src_addr) { *src_addr = (struct sockaddr_in6) { .sin6_family = AF_INET6, .sin6_addr = hdr->dst_addr.ip6, .sin6_port = local_port, }; } if 
(dst_addr) { *dst_addr = (struct sockaddr_in6) { .sin6_family = AF_INET6, .sin6_addr = hdr->src_addr.ip6, .sin6_port = hdr->port, }; } } static u16 cma_port_from_service_id(__be64 service_id) { return (u16)be64_to_cpu(service_id); } static int cma_save_ip_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, const struct ib_cm_event *ib_event, __be64 service_id) { struct cma_hdr *hdr; __be16 port; hdr = ib_event->private_data; if (hdr->cma_version != CMA_VERSION) return -EINVAL; port = htons(cma_port_from_service_id(service_id)); switch (cma_get_ip_ver(hdr)) { case 4: cma_save_ip4_info((struct sockaddr_in *)src_addr, (struct sockaddr_in *)dst_addr, hdr, port); break; case 6: cma_save_ip6_info((struct sockaddr_in6 *)src_addr, (struct sockaddr_in6 *)dst_addr, hdr, port); break; default: return -EAFNOSUPPORT; } return 0; } static int cma_save_net_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, const struct rdma_cm_id *listen_id, const struct ib_cm_event *ib_event, sa_family_t sa_family, __be64 service_id) { if (sa_family == AF_IB) { if (ib_event->event == IB_CM_REQ_RECEIVED) cma_save_ib_info(src_addr, dst_addr, listen_id, ib_event->param.req_rcvd.primary_path); else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) cma_save_ib_info(src_addr, dst_addr, listen_id, NULL); return 0; } return cma_save_ip_info(src_addr, dst_addr, ib_event, service_id); } static int cma_save_req_info(const struct ib_cm_event *ib_event, struct cma_req_info *req) { const struct ib_cm_req_event_param *req_param = &ib_event->param.req_rcvd; const struct ib_cm_sidr_req_event_param *sidr_param = &ib_event->param.sidr_req_rcvd; switch (ib_event->event) { case IB_CM_REQ_RECEIVED: req->device = req_param->listen_id->device; req->port = req_param->port; memcpy(&req->local_gid, &req_param->primary_path->sgid, sizeof(req->local_gid)); req->has_gid = true; req->service_id = req_param->primary_path->service_id; req->pkey = be16_to_cpu(req_param->primary_path->pkey); if (req->pkey != req_param->bth_pkey) pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and primary path P_Key (0x%x)\n" "RDMA CMA: in the future this may cause the request to be dropped\n", req_param->bth_pkey, req->pkey); break; case IB_CM_SIDR_REQ_RECEIVED: req->device = sidr_param->listen_id->device; req->port = sidr_param->port; req->has_gid = false; req->service_id = sidr_param->service_id; req->pkey = sidr_param->pkey; if (req->pkey != sidr_param->bth_pkey) pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and SIDR request payload P_Key (0x%x)\n" "RDMA CMA: in the future this may cause the request to be dropped\n", sidr_param->bth_pkey, req->pkey); break; default: return -EINVAL; } return 0; } static bool validate_ipv4_net_dev(struct net_device *net_dev, const struct sockaddr_in *dst_addr, const struct sockaddr_in *src_addr) { __be32 daddr = dst_addr->sin_addr.s_addr, saddr = src_addr->sin_addr.s_addr; struct fib_result res; struct flowi4 fl4; int err; bool ret; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || ipv4_is_lbcast(daddr) || ipv4_is_zeronet(saddr) || ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr) || ipv4_is_loopback(saddr)) return false; memset(&fl4, 0, sizeof(fl4)); fl4.flowi4_oif = net_dev->ifindex; fl4.daddr = daddr; fl4.saddr = saddr; rcu_read_lock(); err = fib_lookup(dev_net(net_dev), &fl4, &res, 0); ret = err == 0 && FIB_RES_DEV(res) == net_dev; rcu_read_unlock(); return ret; } static bool validate_ipv6_net_dev(struct net_device *net_dev, const struct sockaddr_in6 *dst_addr, const struct 
sockaddr_in6 *src_addr) { #if IS_ENABLED(CONFIG_IPV6) const int strict = ipv6_addr_type(&dst_addr->sin6_addr) & IPV6_ADDR_LINKLOCAL; struct rt6_info *rt = rt6_lookup(dev_net(net_dev), &dst_addr->sin6_addr, &src_addr->sin6_addr, net_dev->ifindex, NULL, strict); bool ret; if (!rt) return false; ret = rt->rt6i_idev->dev == net_dev; ip6_rt_put(rt); return ret; #else return false; #endif } static bool validate_net_dev(struct net_device *net_dev, const struct sockaddr *daddr, const struct sockaddr *saddr) { const struct sockaddr_in *daddr4 = (const struct sockaddr_in *)daddr; const struct sockaddr_in *saddr4 = (const struct sockaddr_in *)saddr; const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr; const struct sockaddr_in6 *saddr6 = (const struct sockaddr_in6 *)saddr; switch (daddr->sa_family) { case AF_INET: return saddr->sa_family == AF_INET && validate_ipv4_net_dev(net_dev, daddr4, saddr4); case AF_INET6: return saddr->sa_family == AF_INET6 && validate_ipv6_net_dev(net_dev, daddr6, saddr6); default: return false; } } static struct net_device * roce_get_net_dev_by_cm_event(const struct ib_cm_event *ib_event) { const struct ib_gid_attr *sgid_attr = NULL; struct net_device *ndev; if (ib_event->event == IB_CM_REQ_RECEIVED) sgid_attr = ib_event->param.req_rcvd.ppath_sgid_attr; else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) sgid_attr = ib_event->param.sidr_req_rcvd.sgid_attr; if (!sgid_attr) return NULL; rcu_read_lock(); ndev = rdma_read_gid_attr_ndev_rcu(sgid_attr); if (IS_ERR(ndev)) ndev = NULL; else dev_hold(ndev); rcu_read_unlock(); return ndev; } static struct net_device *cma_get_net_dev(const struct ib_cm_event *ib_event, struct cma_req_info *req) { struct sockaddr *listen_addr = (struct sockaddr *)&req->listen_addr_storage; struct sockaddr *src_addr = (struct sockaddr *)&req->src_addr_storage; struct net_device *net_dev; const union ib_gid *gid = req->has_gid ? 
&req->local_gid : NULL; int err; err = cma_save_ip_info(listen_addr, src_addr, ib_event, req->service_id); if (err) return ERR_PTR(err); if (rdma_protocol_roce(req->device, req->port)) net_dev = roce_get_net_dev_by_cm_event(ib_event); else net_dev = ib_get_net_dev_by_params(req->device, req->port, req->pkey, gid, listen_addr); if (!net_dev) return ERR_PTR(-ENODEV); return net_dev; } static enum rdma_ucm_port_space rdma_ps_from_service_id(__be64 service_id) { return (be64_to_cpu(service_id) >> 16) & 0xffff; } static bool cma_match_private_data(struct rdma_id_private *id_priv, const struct cma_hdr *hdr) { struct sockaddr *addr = cma_src_addr(id_priv); __be32 ip4_addr; struct in6_addr ip6_addr; if (cma_any_addr(addr) && !id_priv->afonly) return true; switch (addr->sa_family) { case AF_INET: ip4_addr = ((struct sockaddr_in *)addr)->sin_addr.s_addr; if (cma_get_ip_ver(hdr) != 4) return false; if (!cma_any_addr(addr) && hdr->dst_addr.ip4.addr != ip4_addr) return false; break; case AF_INET6: ip6_addr = ((struct sockaddr_in6 *)addr)->sin6_addr; if (cma_get_ip_ver(hdr) != 6) return false; if (!cma_any_addr(addr) && memcmp(&hdr->dst_addr.ip6, &ip6_addr, sizeof(ip6_addr))) return false; break; case AF_IB: return true; default: return false; } return true; } static bool cma_protocol_roce(const struct rdma_cm_id *id) { struct ib_device *device = id->device; const u32 port_num = id->port_num ?: rdma_start_port(device); return rdma_protocol_roce(device, port_num); } static bool cma_is_req_ipv6_ll(const struct cma_req_info *req) { const struct sockaddr *daddr = (const struct sockaddr *)&req->listen_addr_storage; const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr; /* Returns true if the req is for IPv6 link local */ return (daddr->sa_family == AF_INET6 && (ipv6_addr_type(&daddr6->sin6_addr) & IPV6_ADDR_LINKLOCAL)); } static bool cma_match_net_dev(const struct rdma_cm_id *id, const struct net_device *net_dev, const struct cma_req_info *req) { const struct rdma_addr *addr = &id->route.addr; if (!net_dev) /* This request is an AF_IB request */ return (!id->port_num || id->port_num == req->port) && (addr->src_addr.ss_family == AF_IB); /* * If the request is not for IPv6 link local, allow matching the * request to any netdevice of the single- or multi-port rdma device. */ if (!cma_is_req_ipv6_ll(req)) return true; /* * Net namespaces must match, and if the listener is listening * on a specific netdevice then that netdevice must match as well.
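 * The check below therefore accepts the id when bound_dev_if is either
 * zero (not bound to a particular device) or equal to net_dev->ifindex.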
*/ if (net_eq(dev_net(net_dev), addr->dev_addr.net) && (!!addr->dev_addr.bound_dev_if == (addr->dev_addr.bound_dev_if == net_dev->ifindex))) return true; else return false; } static struct rdma_id_private *cma_find_listener( const struct rdma_bind_list *bind_list, const struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event, const struct cma_req_info *req, const struct net_device *net_dev) { struct rdma_id_private *id_priv, *id_priv_dev; lockdep_assert_held(&lock); if (!bind_list) return ERR_PTR(-EINVAL); hlist_for_each_entry(id_priv, &bind_list->owners, node) { if (cma_match_private_data(id_priv, ib_event->private_data)) { if (id_priv->id.device == cm_id->device && cma_match_net_dev(&id_priv->id, net_dev, req)) return id_priv; list_for_each_entry(id_priv_dev, &id_priv->listen_list, listen_item) { if (id_priv_dev->id.device == cm_id->device && cma_match_net_dev(&id_priv_dev->id, net_dev, req)) return id_priv_dev; } } } return ERR_PTR(-EINVAL); } static struct rdma_id_private * cma_ib_id_from_event(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event, struct cma_req_info *req, struct net_device **net_dev) { struct rdma_bind_list *bind_list; struct rdma_id_private *id_priv; int err; err = cma_save_req_info(ib_event, req); if (err) return ERR_PTR(err); *net_dev = cma_get_net_dev(ib_event, req); if (IS_ERR(*net_dev)) { if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) { /* Assuming the protocol is AF_IB */ *net_dev = NULL; } else { return ERR_CAST(*net_dev); } } mutex_lock(&lock); /* * Net namespace might be getting deleted while route lookup, * cm_id lookup is in progress. Therefore, perform netdevice * validation, cm_id lookup under rcu lock. * RCU lock along with netdevice state check, synchronizes with * netdevice migrating to different net namespace and also avoids * case where net namespace doesn't get deleted while lookup is in * progress. * If the device state is not IFF_UP, its properties such as ifindex * and nd_net cannot be trusted to remain valid without rcu lock. * net/core/dev.c change_net_namespace() ensures to synchronize with * ongoing operations on net device after device is closed using * synchronize_net(). */ rcu_read_lock(); if (*net_dev) { /* * If netdevice is down, it is likely that it is administratively * down or it might be migrating to different namespace. * In that case avoid further processing, as the net namespace * or ifindex may change. */ if (((*net_dev)->flags & IFF_UP) == 0) { id_priv = ERR_PTR(-EHOSTUNREACH); goto err; } if (!validate_net_dev(*net_dev, (struct sockaddr *)&req->src_addr_storage, (struct sockaddr *)&req->listen_addr_storage)) { id_priv = ERR_PTR(-EHOSTUNREACH); goto err; } } bind_list = cma_ps_find(*net_dev ? dev_net(*net_dev) : &init_net, rdma_ps_from_service_id(req->service_id), cma_port_from_service_id(req->service_id)); id_priv = cma_find_listener(bind_list, cm_id, ib_event, req, *net_dev); err: rcu_read_unlock(); mutex_unlock(&lock); if (IS_ERR(id_priv) && *net_dev) { dev_put(*net_dev); *net_dev = NULL; } return id_priv; } static inline u8 cma_user_data_offset(struct rdma_id_private *id_priv) { return cma_family(id_priv) == AF_IB ? 
0 : sizeof(struct cma_hdr); } static void cma_cancel_route(struct rdma_id_private *id_priv) { if (rdma_cap_ib_sa(id_priv->id.device, id_priv->id.port_num)) { if (id_priv->query) ib_sa_cancel_query(id_priv->query_id, id_priv->query); } } static void _cma_cancel_listens(struct rdma_id_private *id_priv) { struct rdma_id_private *dev_id_priv; lockdep_assert_held(&lock); /* * Remove from listen_any_list to prevent added devices from spawning * additional listen requests. */ list_del_init(&id_priv->listen_any_item); while (!list_empty(&id_priv->listen_list)) { dev_id_priv = list_first_entry(&id_priv->listen_list, struct rdma_id_private, listen_item); /* sync with device removal to avoid duplicate destruction */ list_del_init(&dev_id_priv->device_item); list_del_init(&dev_id_priv->listen_item); mutex_unlock(&lock); rdma_destroy_id(&dev_id_priv->id); mutex_lock(&lock); } } static void cma_cancel_listens(struct rdma_id_private *id_priv) { mutex_lock(&lock); _cma_cancel_listens(id_priv); mutex_unlock(&lock); } static void cma_cancel_operation(struct rdma_id_private *id_priv, enum rdma_cm_state state) { switch (state) { case RDMA_CM_ADDR_QUERY: /* * We can avoid doing the rdma_addr_cancel() based on state, * only RDMA_CM_ADDR_QUERY has a work that could still execute. * Notice that the addr_handler work could still be exiting * outside this state, however due to the interaction with the * handler_mutex the work is guaranteed not to touch id_priv * during exit. */ rdma_addr_cancel(&id_priv->id.route.addr.dev_addr); break; case RDMA_CM_ROUTE_QUERY: cma_cancel_route(id_priv); break; case RDMA_CM_LISTEN: if (cma_any_addr(cma_src_addr(id_priv)) && !id_priv->cma_dev) cma_cancel_listens(id_priv); break; default: break; } } static void cma_release_port(struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list = id_priv->bind_list; struct net *net = id_priv->id.route.addr.dev_addr.net; if (!bind_list) return; mutex_lock(&lock); hlist_del(&id_priv->node); if (hlist_empty(&bind_list->owners)) { cma_ps_remove(net, bind_list->ps, bind_list->port); kfree(bind_list); } mutex_unlock(&lock); } static void destroy_mc(struct rdma_id_private *id_priv, struct cma_multicast *mc) { bool send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); if (rdma_cap_ib_mcast(id_priv->id.device, id_priv->id.port_num)) ib_sa_free_multicast(mc->sa_mc); if (rdma_protocol_roce(id_priv->id.device, id_priv->id.port_num)) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct net_device *ndev = NULL; if (dev_addr->bound_dev_if) ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (ndev && !send_only) { enum ib_gid_type gid_type; union ib_gid mgid; gid_type = id_priv->cma_dev->default_gid_type [id_priv->id.port_num - rdma_start_port( id_priv->cma_dev->device)]; cma_iboe_set_mgid((struct sockaddr *)&mc->addr, &mgid, gid_type); cma_igmp_send(ndev, &mgid, false); } dev_put(ndev); cancel_work_sync(&mc->iboe_join.work); } kfree(mc); } static void cma_leave_mc_groups(struct rdma_id_private *id_priv) { struct cma_multicast *mc; while (!list_empty(&id_priv->mc_list)) { mc = list_first_entry(&id_priv->mc_list, struct cma_multicast, list); list_del(&mc->list); destroy_mc(id_priv, mc); } } static void _destroy_id(struct rdma_id_private *id_priv, enum rdma_cm_state state) { cma_cancel_operation(id_priv, state); rdma_restrack_del(&id_priv->res); cma_remove_id_from_tree(id_priv); if (id_priv->cma_dev) { if (rdma_cap_ib_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.ib) ib_destroy_cm_id(id_priv->cm_id.ib); } 
else if (rdma_cap_iw_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.iw) iw_destroy_cm_id(id_priv->cm_id.iw); } cma_leave_mc_groups(id_priv); cma_release_dev(id_priv); } cma_release_port(id_priv); cma_id_put(id_priv); wait_for_completion(&id_priv->comp); if (id_priv->internal_id) cma_id_put(id_priv->id.context); kfree(id_priv->id.route.path_rec); kfree(id_priv->id.route.path_rec_inbound); kfree(id_priv->id.route.path_rec_outbound); put_net(id_priv->id.route.addr.dev_addr.net); kfree(id_priv); } /* * Destroy an ID from within the handler_mutex. This ensures that no other * handlers can start running concurrently. */ static void destroy_id_handler_unlock(struct rdma_id_private *id_priv) __releases(&id_priv->handler_mutex) { enum rdma_cm_state state; unsigned long flags; trace_cm_id_destroy(id_priv); /* * Setting the state to destroyed under the handler mutex provides a * fence against calling handler callbacks. If this is invoked due to * the failure of a handler callback then it guarantees that no future * handlers will be called. */ lockdep_assert_held(&id_priv->handler_mutex); spin_lock_irqsave(&id_priv->lock, flags); state = id_priv->state; id_priv->state = RDMA_CM_DESTROYING; spin_unlock_irqrestore(&id_priv->lock, flags); mutex_unlock(&id_priv->handler_mutex); _destroy_id(id_priv, state); } void rdma_destroy_id(struct rdma_cm_id *id) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->handler_mutex); destroy_id_handler_unlock(id_priv); } EXPORT_SYMBOL(rdma_destroy_id); static int cma_rep_recv(struct rdma_id_private *id_priv) { int ret; ret = cma_modify_qp_rtr(id_priv, NULL); if (ret) goto reject; ret = cma_modify_qp_rts(id_priv, NULL); if (ret) goto reject; trace_cm_send_rtu(id_priv); ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0); if (ret) goto reject; return 0; reject: pr_debug_ratelimited("RDMA CM: CONNECT_ERROR: failed to handle reply.
status %d\n", ret); cma_modify_qp_err(id_priv); trace_cm_send_rej(id_priv); ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, NULL, 0); return ret; } static void cma_set_rep_event_data(struct rdma_cm_event *event, const struct ib_cm_rep_event_param *rep_data, void *private_data) { event->param.conn.private_data = private_data; event->param.conn.private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE; event->param.conn.responder_resources = rep_data->responder_resources; event->param.conn.initiator_depth = rep_data->initiator_depth; event->param.conn.flow_control = rep_data->flow_control; event->param.conn.rnr_retry_count = rep_data->rnr_retry_count; event->param.conn.srq = rep_data->srq; event->param.conn.qp_num = rep_data->remote_qpn; event->ece.vendor_id = rep_data->ece.vendor_id; event->ece.attr_mod = rep_data->ece.attr_mod; } static int cma_cm_event_handler(struct rdma_id_private *id_priv, struct rdma_cm_event *event) { int ret; lockdep_assert_held(&id_priv->handler_mutex); trace_cm_event_handler(id_priv, event); ret = id_priv->id.event_handler(&id_priv->id, event); trace_cm_event_done(id_priv, event, ret); return ret; } static int cma_ib_handler(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv = cm_id->context; struct rdma_cm_event event = {}; enum rdma_cm_state state; int ret; mutex_lock(&id_priv->handler_mutex); state = READ_ONCE(id_priv->state); if ((ib_event->event != IB_CM_TIMEWAIT_EXIT && state != RDMA_CM_CONNECT) || (ib_event->event == IB_CM_TIMEWAIT_EXIT && state != RDMA_CM_DISCONNECT)) goto out; switch (ib_event->event) { case IB_CM_REQ_ERROR: case IB_CM_REP_ERROR: event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; break; case IB_CM_REP_RECEIVED: if (state == RDMA_CM_CONNECT && (id_priv->id.qp_type != IB_QPT_UD)) { trace_cm_send_mra(id_priv); ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); } if (id_priv->id.qp) { event.status = cma_rep_recv(id_priv); event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR : RDMA_CM_EVENT_ESTABLISHED; } else { event.event = RDMA_CM_EVENT_CONNECT_RESPONSE; } cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd, ib_event->private_data); break; case IB_CM_RTU_RECEIVED: case IB_CM_USER_ESTABLISHED: event.event = RDMA_CM_EVENT_ESTABLISHED; break; case IB_CM_DREQ_ERROR: event.status = -ETIMEDOUT; fallthrough; case IB_CM_DREQ_RECEIVED: case IB_CM_DREP_RECEIVED: if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_DISCONNECT)) goto out; event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IB_CM_TIMEWAIT_EXIT: event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT; break; case IB_CM_MRA_RECEIVED: /* ignore event */ goto out; case IB_CM_REJ_RECEIVED: pr_debug_ratelimited("RDMA CM: REJECTED: %s\n", rdma_reject_msg(&id_priv->id, ib_event->param.rej_rcvd.reason)); cma_modify_qp_err(id_priv); event.status = ib_event->param.rej_rcvd.reason; event.event = RDMA_CM_EVENT_REJECTED; event.param.conn.private_data = ib_event->private_data; event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; break; default: pr_err("RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } ret = cma_cm_event_handler(id_priv, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. 
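 * A consumer requests this by returning non-zero from its event handler;
 * a hypothetical handler might look like the following sketch:
 *
 *	static int my_cm_handler(struct rdma_cm_id *id,
 *				 struct rdma_cm_event *event)
 *	{
 *		if (event->event == RDMA_CM_EVENT_REJECTED)
 *			return -ECONNREFUSED;
 *		return 0;
 *	}
 *
 * When the handler returns non-zero the CM core destroys the id, so the
 * handler must not call rdma_destroy_id() on it afterwards.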
*/ id_priv->cm_id.ib = NULL; destroy_id_handler_unlock(id_priv); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return 0; } static struct rdma_id_private * cma_ib_new_conn_id(const struct rdma_cm_id *listen_id, const struct ib_cm_event *ib_event, struct net_device *net_dev) { struct rdma_id_private *listen_id_priv; struct rdma_id_private *id_priv; struct rdma_cm_id *id; struct rdma_route *rt; const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; struct sa_path_rec *path = ib_event->param.req_rcvd.primary_path; const __be64 service_id = ib_event->param.req_rcvd.primary_path->service_id; int ret; listen_id_priv = container_of(listen_id, struct rdma_id_private, id); id_priv = __rdma_create_id(listen_id->route.addr.dev_addr.net, listen_id->event_handler, listen_id->context, listen_id->ps, ib_event->param.req_rcvd.qp_type, listen_id_priv); if (IS_ERR(id_priv)) return NULL; id = &id_priv->id; if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, (struct sockaddr *)&id->route.addr.dst_addr, listen_id, ib_event, ss_family, service_id)) goto err; rt = &id->route; rt->num_pri_alt_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1; rt->path_rec = kmalloc_array(rt->num_pri_alt_paths, sizeof(*rt->path_rec), GFP_KERNEL); if (!rt->path_rec) goto err; rt->path_rec[0] = *path; if (rt->num_pri_alt_paths == 2) rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; if (net_dev) { rdma_copy_src_l2_addr(&rt->addr.dev_addr, net_dev); } else { if (!cma_protocol_roce(listen_id) && cma_any_addr(cma_src_addr(id_priv))) { rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND; rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey)); } else if (!cma_any_addr(cma_src_addr(id_priv))) { ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr); if (ret) goto err; } } rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); id_priv->state = RDMA_CM_CONNECT; return id_priv; err: rdma_destroy_id(id); return NULL; } static struct rdma_id_private * cma_ib_new_udp_id(const struct rdma_cm_id *listen_id, const struct ib_cm_event *ib_event, struct net_device *net_dev) { const struct rdma_id_private *listen_id_priv; struct rdma_id_private *id_priv; struct rdma_cm_id *id; const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; struct net *net = listen_id->route.addr.dev_addr.net; int ret; listen_id_priv = container_of(listen_id, struct rdma_id_private, id); id_priv = __rdma_create_id(net, listen_id->event_handler, listen_id->context, listen_id->ps, IB_QPT_UD, listen_id_priv); if (IS_ERR(id_priv)) return NULL; id = &id_priv->id; if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, (struct sockaddr *)&id->route.addr.dst_addr, listen_id, ib_event, ss_family, ib_event->param.sidr_req_rcvd.service_id)) goto err; if (net_dev) { rdma_copy_src_l2_addr(&id->route.addr.dev_addr, net_dev); } else { if (!cma_any_addr(cma_src_addr(id_priv))) { ret = cma_translate_addr(cma_src_addr(id_priv), &id->route.addr.dev_addr); if (ret) goto err; } } id_priv->state = RDMA_CM_CONNECT; return id_priv; err: rdma_destroy_id(id); return NULL; } static void cma_set_req_event_data(struct rdma_cm_event *event, const struct ib_cm_req_event_param *req_data, void *private_data, int offset) { event->param.conn.private_data = private_data + offset; event->param.conn.private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset; event->param.conn.responder_resources = req_data->responder_resources; 
event->param.conn.initiator_depth = req_data->initiator_depth; event->param.conn.flow_control = req_data->flow_control; event->param.conn.retry_count = req_data->retry_count; event->param.conn.rnr_retry_count = req_data->rnr_retry_count; event->param.conn.srq = req_data->srq; event->param.conn.qp_num = req_data->remote_qpn; event->ece.vendor_id = req_data->ece.vendor_id; event->ece.attr_mod = req_data->ece.attr_mod; } static int cma_ib_check_req_qp_type(const struct rdma_cm_id *id, const struct ib_cm_event *ib_event) { return (((ib_event->event == IB_CM_REQ_RECEIVED) && (ib_event->param.req_rcvd.qp_type == id->qp_type)) || ((ib_event->event == IB_CM_SIDR_REQ_RECEIVED) && (id->qp_type == IB_QPT_UD)) || (!id->qp_type)); } static int cma_ib_req_handler(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event) { struct rdma_id_private *listen_id, *conn_id = NULL; struct rdma_cm_event event = {}; struct cma_req_info req = {}; struct net_device *net_dev; u8 offset; int ret; listen_id = cma_ib_id_from_event(cm_id, ib_event, &req, &net_dev); if (IS_ERR(listen_id)) return PTR_ERR(listen_id); trace_cm_req_handler(listen_id, ib_event->event); if (!cma_ib_check_req_qp_type(&listen_id->id, ib_event)) { ret = -EINVAL; goto net_dev_put; } mutex_lock(&listen_id->handler_mutex); if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN) { ret = -ECONNABORTED; goto err_unlock; } offset = cma_user_data_offset(listen_id); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) { conn_id = cma_ib_new_udp_id(&listen_id->id, ib_event, net_dev); event.param.ud.private_data = ib_event->private_data + offset; event.param.ud.private_data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset; } else { conn_id = cma_ib_new_conn_id(&listen_id->id, ib_event, net_dev); cma_set_req_event_data(&event, &ib_event->param.req_rcvd, ib_event->private_data, offset); } if (!conn_id) { ret = -ENOMEM; goto err_unlock; } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); ret = cma_ib_acquire_dev(conn_id, listen_id, &req); if (ret) { destroy_id_handler_unlock(conn_id); goto err_unlock; } conn_id->cm_id.ib = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_ib_handler; ret = cma_cm_event_handler(conn_id, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. 
*/ conn_id->cm_id.ib = NULL; mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); goto net_dev_put; } if (READ_ONCE(conn_id->state) == RDMA_CM_CONNECT && conn_id->id.qp_type != IB_QPT_UD) { trace_cm_send_mra(cm_id->context); ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); } mutex_unlock(&conn_id->handler_mutex); err_unlock: mutex_unlock(&listen_id->handler_mutex); net_dev_put: dev_put(net_dev); return ret; } __be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr) { if (addr->sa_family == AF_IB) return ((struct sockaddr_ib *) addr)->sib_sid; return cpu_to_be64(((u64)id->ps << 16) + be16_to_cpu(cma_port(addr))); } EXPORT_SYMBOL(rdma_get_service_id); void rdma_read_gids(struct rdma_cm_id *cm_id, union ib_gid *sgid, union ib_gid *dgid) { struct rdma_addr *addr = &cm_id->route.addr; if (!cm_id->device) { if (sgid) memset(sgid, 0, sizeof(*sgid)); if (dgid) memset(dgid, 0, sizeof(*dgid)); return; } if (rdma_protocol_roce(cm_id->device, cm_id->port_num)) { if (sgid) rdma_ip2gid((struct sockaddr *)&addr->src_addr, sgid); if (dgid) rdma_ip2gid((struct sockaddr *)&addr->dst_addr, dgid); } else { if (sgid) rdma_addr_get_sgid(&addr->dev_addr, sgid); if (dgid) rdma_addr_get_dgid(&addr->dev_addr, dgid); } } EXPORT_SYMBOL(rdma_read_gids); static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) { struct rdma_id_private *id_priv = iw_id->context; struct rdma_cm_event event = {}; int ret = 0; struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) goto out; switch (iw_event->event) { case IW_CM_EVENT_CLOSE: event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IW_CM_EVENT_CONNECT_REPLY: memcpy(cma_src_addr(id_priv), laddr, rdma_addr_size(laddr)); memcpy(cma_dst_addr(id_priv), raddr, rdma_addr_size(raddr)); switch (iw_event->status) { case 0: event.event = RDMA_CM_EVENT_ESTABLISHED; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; break; case -ECONNRESET: case -ECONNREFUSED: event.event = RDMA_CM_EVENT_REJECTED; break; case -ETIMEDOUT: event.event = RDMA_CM_EVENT_UNREACHABLE; break; default: event.event = RDMA_CM_EVENT_CONNECT_ERROR; break; } break; case IW_CM_EVENT_ESTABLISHED: event.event = RDMA_CM_EVENT_ESTABLISHED; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; break; default: goto out; } event.status = iw_event->status; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; ret = cma_cm_event_handler(id_priv, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. 
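 * The same non-zero-return convention sketched in cma_ib_handler() above
 * applies to iWARP ids as well.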
*/ id_priv->cm_id.iw = NULL; destroy_id_handler_unlock(id_priv); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return ret; } static int iw_conn_req_handler(struct iw_cm_id *cm_id, struct iw_cm_event *iw_event) { struct rdma_id_private *listen_id, *conn_id; struct rdma_cm_event event = {}; int ret = -ECONNABORTED; struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; event.event = RDMA_CM_EVENT_CONNECT_REQUEST; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; listen_id = cm_id->context; mutex_lock(&listen_id->handler_mutex); if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN) goto out; /* Create a new RDMA id for the new IW CM ID */ conn_id = __rdma_create_id(listen_id->id.route.addr.dev_addr.net, listen_id->id.event_handler, listen_id->id.context, RDMA_PS_TCP, IB_QPT_RC, listen_id); if (IS_ERR(conn_id)) { ret = -ENOMEM; goto out; } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); conn_id->state = RDMA_CM_CONNECT; ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr); if (ret) { mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); return ret; } ret = cma_iw_acquire_dev(conn_id, listen_id); if (ret) { mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); return ret; } conn_id->cm_id.iw = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_iw_handler; memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr)); memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr)); ret = cma_cm_event_handler(conn_id, &event); if (ret) { /* User wants to destroy the CM ID */ conn_id->cm_id.iw = NULL; mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); return ret; } mutex_unlock(&conn_id->handler_mutex); out: mutex_unlock(&listen_id->handler_mutex); return ret; } static int cma_ib_listen(struct rdma_id_private *id_priv) { struct sockaddr *addr; struct ib_cm_id *id; __be64 svc_id; addr = cma_src_addr(id_priv); svc_id = rdma_get_service_id(&id_priv->id, addr); id = ib_cm_insert_listen(id_priv->id.device, cma_ib_req_handler, svc_id); if (IS_ERR(id)) return PTR_ERR(id); id_priv->cm_id.ib = id; return 0; } static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) { int ret; struct iw_cm_id *id; id = iw_create_cm_id(id_priv->id.device, iw_conn_req_handler, id_priv); if (IS_ERR(id)) return PTR_ERR(id); mutex_lock(&id_priv->qp_mutex); id->tos = id_priv->tos; id->tos_set = id_priv->tos_set; mutex_unlock(&id_priv->qp_mutex); id->afonly = id_priv->afonly; id_priv->cm_id.iw = id; memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); ret = iw_cm_listen(id_priv->cm_id.iw, backlog); if (ret) { iw_destroy_cm_id(id_priv->cm_id.iw); id_priv->cm_id.iw = NULL; } return ret; } static int cma_listen_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { struct rdma_id_private *id_priv = id->context; /* Listening IDs are always destroyed on removal */ if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) return -1; id->context = id_priv->id.context; id->event_handler = id_priv->id.event_handler; trace_cm_event_handler(id_priv, event); return id_priv->id.event_handler(id, event); } static int cma_listen_on_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev, struct rdma_id_private 
**to_destroy) { struct rdma_id_private *dev_id_priv; struct net *net = id_priv->id.route.addr.dev_addr.net; int ret; lockdep_assert_held(&lock); *to_destroy = NULL; if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1)) return 0; dev_id_priv = __rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps, id_priv->id.qp_type, id_priv); if (IS_ERR(dev_id_priv)) return PTR_ERR(dev_id_priv); dev_id_priv->state = RDMA_CM_ADDR_BOUND; memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); _cma_attach_to_dev(dev_id_priv, cma_dev); rdma_restrack_add(&dev_id_priv->res); cma_id_get(id_priv); dev_id_priv->internal_id = 1; dev_id_priv->afonly = id_priv->afonly; mutex_lock(&id_priv->qp_mutex); dev_id_priv->tos_set = id_priv->tos_set; dev_id_priv->tos = id_priv->tos; mutex_unlock(&id_priv->qp_mutex); ret = rdma_listen(&dev_id_priv->id, id_priv->backlog); if (ret) goto err_listen; list_add_tail(&dev_id_priv->listen_item, &id_priv->listen_list); return 0; err_listen: /* Caller must destroy this after releasing lock */ *to_destroy = dev_id_priv; dev_warn(&cma_dev->device->dev, "RDMA CMA: %s, error %d\n", __func__, ret); return ret; } static int cma_listen_on_all(struct rdma_id_private *id_priv) { struct rdma_id_private *to_destroy; struct cma_device *cma_dev; int ret; mutex_lock(&lock); list_add_tail(&id_priv->listen_any_item, &listen_any_list); list_for_each_entry(cma_dev, &dev_list, list) { ret = cma_listen_on_dev(id_priv, cma_dev, &to_destroy); if (ret) { /* Prevent racing with cma_process_remove() */ if (to_destroy) list_del_init(&to_destroy->device_item); goto err_listen; } } mutex_unlock(&lock); return 0; err_listen: _cma_cancel_listens(id_priv); mutex_unlock(&lock); if (to_destroy) rdma_destroy_id(&to_destroy->id); return ret; } void rdma_set_service_type(struct rdma_cm_id *id, int tos) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->qp_mutex); id_priv->tos = (u8) tos; id_priv->tos_set = true; mutex_unlock(&id_priv->qp_mutex); } EXPORT_SYMBOL(rdma_set_service_type); /** * rdma_set_ack_timeout() - Set the ack timeout of QP associated * with a connection identifier. * @id: Communication identifier associated with the QP. * @timeout: Ack timeout to set a QP, expressed as 4.096 * 2^(timeout) usec. * * This function should be called before rdma_connect() on active side, * and on passive side before rdma_accept(). It is applicable to primary * path only. The timeout affects only the local side of the QP; it is not * negotiated with the remote side, and zero disables the timer. In case it is * set before rdma_resolve_route, the value will also be used to determine * PacketLifeTime for RoCE. * * Return: 0 for success */ int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout) { struct rdma_id_private *id_priv; if (id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_INI) return -EINVAL; id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->qp_mutex); id_priv->timeout = timeout; id_priv->timeout_set = true; mutex_unlock(&id_priv->qp_mutex); return 0; } EXPORT_SYMBOL(rdma_set_ack_timeout); /** * rdma_set_min_rnr_timer() - Set the minimum RNR Retry timer of the * QP associated with a connection identifier. * @id: Communication identifier associated with the QP. * @min_rnr_timer: 5-bit value encoded as Table 45: "Encoding for RNR NAK * Timer Field" in the IBTA specification.
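 *
 * A minimal passive-side usage sketch (hypothetical id and conn_param,
 * for illustration only):
 *
 *	rdma_set_min_rnr_timer(id, IB_RNR_TIMER_000_16);
 *	rdma_accept(id, &conn_param);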
* * This function should be called before rdma_connect() on active * side, and on passive side before rdma_accept(). The timer value * will be associated with the local QP. When it receives a send that it is * not ready to handle, typically because the receive queue is empty, an RNR * Retry NAK is returned to the requester with the min_rnr_timer * encoded. The requester will then wait at least the time specified * in the NAK before retrying. The default is zero, which translates * to a minimum RNR Timer value of 655 ms. * * Return: 0 for success */ int rdma_set_min_rnr_timer(struct rdma_cm_id *id, u8 min_rnr_timer) { struct rdma_id_private *id_priv; /* It is a five-bit value */ if (min_rnr_timer & 0xe0) return -EINVAL; if (WARN_ON(id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_TGT)) return -EINVAL; id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->qp_mutex); id_priv->min_rnr_timer = min_rnr_timer; id_priv->min_rnr_timer_set = true; mutex_unlock(&id_priv->qp_mutex); return 0; } EXPORT_SYMBOL(rdma_set_min_rnr_timer); static int route_set_path_rec_inbound(struct cma_work *work, struct sa_path_rec *path_rec) { struct rdma_route *route = &work->id->id.route; if (!route->path_rec_inbound) { route->path_rec_inbound = kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL); if (!route->path_rec_inbound) return -ENOMEM; } *route->path_rec_inbound = *path_rec; return 0; } static int route_set_path_rec_outbound(struct cma_work *work, struct sa_path_rec *path_rec) { struct rdma_route *route = &work->id->id.route; if (!route->path_rec_outbound) { route->path_rec_outbound = kzalloc(sizeof(*route->path_rec_outbound), GFP_KERNEL); if (!route->path_rec_outbound) return -ENOMEM; } *route->path_rec_outbound = *path_rec; return 0; } static void cma_query_handler(int status, struct sa_path_rec *path_rec, unsigned int num_prs, void *context) { struct cma_work *work = context; struct rdma_route *route; int i; route = &work->id->id.route; if (status) goto fail; for (i = 0; i < num_prs; i++) { if (!path_rec[i].flags || (path_rec[i].flags & IB_PATH_GMP)) *route->path_rec = path_rec[i]; else if (path_rec[i].flags & IB_PATH_INBOUND) status = route_set_path_rec_inbound(work, &path_rec[i]); else if (path_rec[i].flags & IB_PATH_OUTBOUND) status = route_set_path_rec_outbound(work, &path_rec[i]); else status = -EINVAL; if (status) goto fail; } route->num_pri_alt_paths = 1; queue_work(cma_wq, &work->work); return; fail: work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; work->event.status = status; pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path.
status %d\n", status); queue_work(cma_wq, &work->work); } static int cma_query_ib_route(struct rdma_id_private *id_priv, unsigned long timeout_ms, struct cma_work *work) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct sa_path_rec path_rec; ib_sa_comp_mask comp_mask; struct sockaddr_in6 *sin6; struct sockaddr_ib *sib; memset(&path_rec, 0, sizeof path_rec); if (rdma_cap_opa_ah(id_priv->id.device, id_priv->id.port_num)) path_rec.rec_type = SA_PATH_REC_TYPE_OPA; else path_rec.rec_type = SA_PATH_REC_TYPE_IB; rdma_addr_get_sgid(dev_addr, &path_rec.sgid); rdma_addr_get_dgid(dev_addr, &path_rec.dgid); path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); path_rec.numb_path = 1; path_rec.reversible = 1; path_rec.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID; switch (cma_family(id_priv)) { case AF_INET: path_rec.qos_class = cpu_to_be16((u16) id_priv->tos); comp_mask |= IB_SA_PATH_REC_QOS_CLASS; break; case AF_INET6: sin6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20); comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; break; case AF_IB: sib = (struct sockaddr_ib *) cma_src_addr(id_priv); path_rec.traffic_class = (u8) (be32_to_cpu(sib->sib_flowinfo) >> 20); comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; break; } id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device, id_priv->id.port_num, &path_rec, comp_mask, timeout_ms, GFP_KERNEL, cma_query_handler, work, &id_priv->query); return (id_priv->query_id < 0) ? id_priv->query_id : 0; } static void cma_iboe_join_work_handler(struct work_struct *work) { struct cma_multicast *mc = container_of(work, struct cma_multicast, iboe_join.work); struct rdma_cm_event *event = &mc->iboe_join.event; struct rdma_id_private *id_priv = mc->id_priv; int ret; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) goto out_unlock; ret = cma_cm_event_handler(id_priv, event); WARN_ON(ret); out_unlock: mutex_unlock(&id_priv->handler_mutex); if (event->event == RDMA_CM_EVENT_MULTICAST_JOIN) rdma_destroy_ah_attr(&event->param.ud.ah_attr); } static void cma_work_handler(struct work_struct *_work) { struct cma_work *work = container_of(_work, struct cma_work, work); struct rdma_id_private *id_priv = work->id; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) goto out_unlock; if (work->old_state != 0 || work->new_state != 0) { if (!cma_comp_exch(id_priv, work->old_state, work->new_state)) goto out_unlock; } if (cma_cm_event_handler(id_priv, &work->event)) { cma_id_put(id_priv); destroy_id_handler_unlock(id_priv); goto out_free; } out_unlock: mutex_unlock(&id_priv->handler_mutex); cma_id_put(id_priv); out_free: if (work->event.event == RDMA_CM_EVENT_MULTICAST_JOIN) rdma_destroy_ah_attr(&work->event.param.ud.ah_attr); kfree(work); } static void cma_init_resolve_route_work(struct cma_work *work, struct rdma_id_private *id_priv) { work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; } static void enqueue_resolve_addr_work(struct cma_work *work, struct rdma_id_private *id_priv) { 
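	/*
	 * Queue the address-resolution completion on cma_wq; cma_work_handler()
	 * moves the id from RDMA_CM_ADDR_QUERY to RDMA_CM_ADDR_RESOLVED and
	 * delivers RDMA_CM_EVENT_ADDR_RESOLVED to the consumer.
	 */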
/* Balances with cma_id_put() in cma_work_handler */ cma_id_get(id_priv); work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ADDR_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; queue_work(cma_wq, &work->work); } static int cma_resolve_ib_route(struct rdma_id_private *id_priv, unsigned long timeout_ms) { struct rdma_route *route = &id_priv->id.route; struct cma_work *work; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; cma_init_resolve_route_work(work, id_priv); if (!route->path_rec) route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; } ret = cma_query_ib_route(id_priv, timeout_ms, work); if (ret) goto err2; return 0; err2: kfree(route->path_rec); route->path_rec = NULL; err1: kfree(work); return ret; } static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type, unsigned long supported_gids, enum ib_gid_type default_gid) { if ((network_type == RDMA_NETWORK_IPV4 || network_type == RDMA_NETWORK_IPV6) && test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids)) return IB_GID_TYPE_ROCE_UDP_ENCAP; return default_gid; } /* * cma_iboe_set_path_rec_l2_fields() is helper function which sets * path record type based on GID type. * It also sets up other L2 fields which includes destination mac address * netdev ifindex, of the path record. * It returns the netdev of the bound interface for this path record entry. */ static struct net_device * cma_iboe_set_path_rec_l2_fields(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; enum ib_gid_type gid_type = IB_GID_TYPE_ROCE; struct rdma_addr *addr = &route->addr; unsigned long supported_gids; struct net_device *ndev; if (!addr->dev_addr.bound_dev_if) return NULL; ndev = dev_get_by_index(addr->dev_addr.net, addr->dev_addr.bound_dev_if); if (!ndev) return NULL; supported_gids = roce_gid_type_mask_support(id_priv->id.device, id_priv->id.port_num); gid_type = cma_route_gid_type(addr->dev_addr.network, supported_gids, id_priv->gid_type); /* Use the hint from IP Stack to select GID Type */ if (gid_type < ib_network_to_gid_type(addr->dev_addr.network)) gid_type = ib_network_to_gid_type(addr->dev_addr.network); route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type); route->path_rec->roce.route_resolved = true; sa_path_set_dmac(route->path_rec, addr->dev_addr.dst_dev_addr); return ndev; } int rdma_set_ib_path(struct rdma_cm_id *id, struct sa_path_rec *path_rec) { struct rdma_id_private *id_priv; struct net_device *ndev; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_RESOLVED)) return -EINVAL; id->route.path_rec = kmemdup(path_rec, sizeof(*path_rec), GFP_KERNEL); if (!id->route.path_rec) { ret = -ENOMEM; goto err; } if (rdma_protocol_roce(id->device, id->port_num)) { ndev = cma_iboe_set_path_rec_l2_fields(id_priv); if (!ndev) { ret = -ENODEV; goto err_free; } dev_put(ndev); } id->route.num_pri_alt_paths = 1; return 0; err_free: kfree(id->route.path_rec); id->route.path_rec = NULL; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED); return ret; } EXPORT_SYMBOL(rdma_set_ib_path); static int cma_resolve_iw_route(struct rdma_id_private *id_priv) { struct cma_work *work; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; cma_init_resolve_route_work(work, id_priv); queue_work(cma_wq, &work->work); return 0; } static 
int get_vlan_ndev_tc(struct net_device *vlan_ndev, int prio) { struct net_device *dev; dev = vlan_dev_real_dev(vlan_ndev); if (dev->num_tc) return netdev_get_prio_tc_map(dev, prio); return (vlan_dev_get_egress_qos_mask(vlan_ndev, prio) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } struct iboe_prio_tc_map { int input_prio; int output_tc; bool found; }; static int get_lower_vlan_dev_tc(struct net_device *dev, struct netdev_nested_priv *priv) { struct iboe_prio_tc_map *map = (struct iboe_prio_tc_map *)priv->data; if (is_vlan_dev(dev)) map->output_tc = get_vlan_ndev_tc(dev, map->input_prio); else if (dev->num_tc) map->output_tc = netdev_get_prio_tc_map(dev, map->input_prio); else map->output_tc = 0; /* We are interested only in first level VLAN device, so always * return 1 to stop iterating over next level devices. */ map->found = true; return 1; } static int iboe_tos_to_sl(struct net_device *ndev, int tos) { struct iboe_prio_tc_map prio_tc_map = {}; int prio = rt_tos2priority(tos); struct netdev_nested_priv priv; /* If VLAN device, get it directly from the VLAN netdev */ if (is_vlan_dev(ndev)) return get_vlan_ndev_tc(ndev, prio); prio_tc_map.input_prio = prio; priv.data = (void *)&prio_tc_map; rcu_read_lock(); netdev_walk_all_lower_dev_rcu(ndev, get_lower_vlan_dev_tc, &priv); rcu_read_unlock(); /* If map is found from lower device, use it; Otherwise * continue with the current netdevice to get priority to tc map. */ if (prio_tc_map.found) return prio_tc_map.output_tc; else if (ndev->num_tc) return netdev_get_prio_tc_map(ndev, prio); else return 0; } static __be32 cma_get_roce_udp_flow_label(struct rdma_id_private *id_priv) { struct sockaddr_in6 *addr6; u16 dport, sport; u32 hash, fl; addr6 = (struct sockaddr_in6 *)cma_src_addr(id_priv); fl = be32_to_cpu(addr6->sin6_flowinfo) & IB_GRH_FLOWLABEL_MASK; if ((cma_family(id_priv) != AF_INET6) || !fl) { dport = be16_to_cpu(cma_port(cma_dst_addr(id_priv))); sport = be16_to_cpu(cma_port(cma_src_addr(id_priv))); hash = (u32)sport * 31 + dport; fl = hash & IB_GRH_FLOWLABEL_MASK; } return cpu_to_be32(fl); } static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; struct rdma_addr *addr = &route->addr; struct cma_work *work; int ret; struct net_device *ndev; u8 default_roce_tos = id_priv->cma_dev->default_roce_tos[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; u8 tos; mutex_lock(&id_priv->qp_mutex); tos = id_priv->tos_set ? 
id_priv->tos : default_roce_tos; mutex_unlock(&id_priv->qp_mutex); work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; } route->num_pri_alt_paths = 1; ndev = cma_iboe_set_path_rec_l2_fields(id_priv); if (!ndev) { ret = -ENODEV; goto err2; } rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &route->path_rec->sgid); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, &route->path_rec->dgid); if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB) /* TODO: get the hoplimit from the inet/inet6 device */ route->path_rec->hop_limit = addr->dev_addr.hoplimit; else route->path_rec->hop_limit = 1; route->path_rec->reversible = 1; route->path_rec->pkey = cpu_to_be16(0xffff); route->path_rec->mtu_selector = IB_SA_EQ; route->path_rec->sl = iboe_tos_to_sl(ndev, tos); route->path_rec->traffic_class = tos; route->path_rec->mtu = iboe_get_mtu(ndev->mtu); route->path_rec->rate_selector = IB_SA_EQ; route->path_rec->rate = IB_RATE_PORT_CURRENT; dev_put(ndev); route->path_rec->packet_life_time_selector = IB_SA_EQ; /* In case ACK timeout is set, use this value to calculate * PacketLifeTime. As per IBTA 12.7.34, * local ACK timeout = (2 * PacketLifeTime + Local CA’s ACK delay). * Assuming a negligible local ACK delay, we can use * PacketLifeTime = local ACK timeout/2 * as a reasonable approximation for RoCE networks. */ mutex_lock(&id_priv->qp_mutex); if (id_priv->timeout_set && id_priv->timeout) route->path_rec->packet_life_time = id_priv->timeout - 1; else route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME; mutex_unlock(&id_priv->qp_mutex); if (!route->path_rec->mtu) { ret = -EINVAL; goto err2; } if (rdma_protocol_roce_udp_encap(id_priv->id.device, id_priv->id.port_num)) route->path_rec->flow_label = cma_get_roce_udp_flow_label(id_priv); cma_init_resolve_route_work(work, id_priv); queue_work(cma_wq, &work->work); return 0; err2: kfree(route->path_rec); route->path_rec = NULL; route->num_pri_alt_paths = 0; err1: kfree(work); return ret; } int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms) { struct rdma_id_private *id_priv; int ret; if (!timeout_ms) return -EINVAL; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY)) return -EINVAL; cma_id_get(id_priv); if (rdma_cap_ib_sa(id->device, id->port_num)) ret = cma_resolve_ib_route(id_priv, timeout_ms); else if (rdma_protocol_roce(id->device, id->port_num)) { ret = cma_resolve_iboe_route(id_priv); if (!ret) cma_add_id_to_tree(id_priv); } else if (rdma_protocol_iwarp(id->device, id->port_num)) ret = cma_resolve_iw_route(id_priv); else ret = -ENOSYS; if (ret) goto err; return 0; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED); cma_id_put(id_priv); return ret; } EXPORT_SYMBOL(rdma_resolve_route); static void cma_set_loopback(struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: ((struct sockaddr_in *) addr)->sin_addr.s_addr = htonl(INADDR_LOOPBACK); break; case AF_INET6: ipv6_addr_set(&((struct sockaddr_in6 *) addr)->sin6_addr, 0, 0, 0, htonl(1)); break; default: ib_addr_set(&((struct sockaddr_ib *) addr)->sib_addr, 0, 0, 0, htonl(1)); break; } } static int cma_bind_loopback(struct rdma_id_private *id_priv) { struct cma_device *cma_dev, *cur_dev; union ib_gid gid; enum ib_port_state port_state; unsigned int p; u16 pkey; int ret; cma_dev = NULL; 
mutex_lock(&lock); list_for_each_entry(cur_dev, &dev_list, list) { if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cur_dev->device, 1)) continue; if (!cma_dev) cma_dev = cur_dev; rdma_for_each_port (cur_dev->device, p) { if (!ib_get_cached_port_state(cur_dev->device, p, &port_state) && port_state == IB_PORT_ACTIVE) { cma_dev = cur_dev; goto port_found; } } } if (!cma_dev) { ret = -ENODEV; goto out; } p = 1; port_found: ret = rdma_query_gid(cma_dev->device, p, 0, &gid); if (ret) goto out; ret = ib_get_cached_pkey(cma_dev->device, p, 0, &pkey); if (ret) goto out; id_priv->id.route.addr.dev_addr.dev_type = (rdma_protocol_ib(cma_dev->device, p)) ? ARPHRD_INFINIBAND : ARPHRD_ETHER; rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid); ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey); id_priv->id.port_num = p; cma_attach_to_dev(id_priv, cma_dev); rdma_restrack_add(&id_priv->res); cma_set_loopback(cma_src_addr(id_priv)); out: mutex_unlock(&lock); return ret; } static void addr_handler(int status, struct sockaddr *src_addr, struct rdma_dev_addr *dev_addr, void *context) { struct rdma_id_private *id_priv = context; struct rdma_cm_event event = {}; struct sockaddr *addr; struct sockaddr_storage old_addr; mutex_lock(&id_priv->handler_mutex); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_RESOLVED)) goto out; /* * Store the previous src address, so that if we fail to acquire * matching rdma device, old address can be restored back, which helps * to cancel the cma listen operation correctly. */ addr = cma_src_addr(id_priv); memcpy(&old_addr, addr, rdma_addr_size(addr)); memcpy(addr, src_addr, rdma_addr_size(src_addr)); if (!status && !id_priv->cma_dev) { status = cma_acquire_dev_by_src_ip(id_priv); if (status) pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to acquire device. status %d\n", status); rdma_restrack_add(&id_priv->res); } else if (status) { pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to resolve IP. 
status %d\n", status); } if (status) { memcpy(addr, &old_addr, rdma_addr_size((struct sockaddr *)&old_addr)); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ADDR_BOUND)) goto out; event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = status; } else event.event = RDMA_CM_EVENT_ADDR_RESOLVED; if (cma_cm_event_handler(id_priv, &event)) { destroy_id_handler_unlock(id_priv); return; } out: mutex_unlock(&id_priv->handler_mutex); } static int cma_resolve_loopback(struct rdma_id_private *id_priv) { struct cma_work *work; union ib_gid gid; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; if (!id_priv->cma_dev) { ret = cma_bind_loopback(id_priv); if (ret) goto err; } rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); enqueue_resolve_addr_work(work, id_priv); return 0; err: kfree(work); return ret; } static int cma_resolve_ib_addr(struct rdma_id_private *id_priv) { struct cma_work *work; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; if (!id_priv->cma_dev) { ret = cma_resolve_ib_dev(id_priv); if (ret) goto err; } rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *) &(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr)); enqueue_resolve_addr_work(work, id_priv); return 0; err: kfree(work); return ret; } int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse) { struct rdma_id_private *id_priv; unsigned long flags; int ret; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); if ((reuse && id_priv->state != RDMA_CM_LISTEN) || id_priv->state == RDMA_CM_IDLE) { id_priv->reuseaddr = reuse; ret = 0; } else { ret = -EINVAL; } spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } EXPORT_SYMBOL(rdma_set_reuseaddr); int rdma_set_afonly(struct rdma_cm_id *id, int afonly) { struct rdma_id_private *id_priv; unsigned long flags; int ret; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); if (id_priv->state == RDMA_CM_IDLE || id_priv->state == RDMA_CM_ADDR_BOUND) { id_priv->options |= (1 << CMA_OPTION_AFONLY); id_priv->afonly = afonly; ret = 0; } else { ret = -EINVAL; } spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } EXPORT_SYMBOL(rdma_set_afonly); static void cma_bind_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv) { struct sockaddr *addr; struct sockaddr_ib *sib; u64 sid, mask; __be16 port; lockdep_assert_held(&lock); addr = cma_src_addr(id_priv); port = htons(bind_list->port); switch (addr->sa_family) { case AF_INET: ((struct sockaddr_in *) addr)->sin_port = port; break; case AF_INET6: ((struct sockaddr_in6 *) addr)->sin6_port = port; break; case AF_IB: sib = (struct sockaddr_ib *) addr; sid = be64_to_cpu(sib->sib_sid); mask = be64_to_cpu(sib->sib_sid_mask); sib->sib_sid = cpu_to_be64((sid & mask) | (u64) ntohs(port)); sib->sib_sid_mask = cpu_to_be64(~0ULL); break; } id_priv->bind_list = bind_list; hlist_add_head(&id_priv->node, &bind_list->owners); } static int cma_alloc_port(enum rdma_ucm_port_space ps, struct rdma_id_private *id_priv, unsigned short snum) { struct rdma_bind_list *bind_list; int ret; lockdep_assert_held(&lock); bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); if (!bind_list) return -ENOMEM; ret = cma_ps_alloc(id_priv->id.route.addr.dev_addr.net, ps, bind_list, snum); if (ret < 0) goto err; bind_list->ps = ps; bind_list->port = snum; cma_bind_port(bind_list, id_priv); return 0; err: 
kfree(bind_list); return ret == -ENOSPC ? -EADDRNOTAVAIL : ret; } static int cma_port_is_unique(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv) { struct rdma_id_private *cur_id; struct sockaddr *daddr = cma_dst_addr(id_priv); struct sockaddr *saddr = cma_src_addr(id_priv); __be16 dport = cma_port(daddr); lockdep_assert_held(&lock); hlist_for_each_entry(cur_id, &bind_list->owners, node) { struct sockaddr *cur_daddr = cma_dst_addr(cur_id); struct sockaddr *cur_saddr = cma_src_addr(cur_id); __be16 cur_dport = cma_port(cur_daddr); if (id_priv == cur_id) continue; /* different dest port -> unique */ if (!cma_any_port(daddr) && !cma_any_port(cur_daddr) && (dport != cur_dport)) continue; /* different src address -> unique */ if (!cma_any_addr(saddr) && !cma_any_addr(cur_saddr) && cma_addr_cmp(saddr, cur_saddr)) continue; /* different dst address -> unique */ if (!cma_any_addr(daddr) && !cma_any_addr(cur_daddr) && cma_addr_cmp(daddr, cur_daddr)) continue; return -EADDRNOTAVAIL; } return 0; } static int cma_alloc_any_port(enum rdma_ucm_port_space ps, struct rdma_id_private *id_priv) { static unsigned int last_used_port; int low, high, remaining; unsigned int rover; struct net *net = id_priv->id.route.addr.dev_addr.net; lockdep_assert_held(&lock); inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; rover = get_random_u32_inclusive(low, remaining + low - 1); retry: if (last_used_port != rover) { struct rdma_bind_list *bind_list; int ret; bind_list = cma_ps_find(net, ps, (unsigned short)rover); if (!bind_list) { ret = cma_alloc_port(ps, id_priv, rover); } else { ret = cma_port_is_unique(bind_list, id_priv); if (!ret) cma_bind_port(bind_list, id_priv); } /* * Remember previously used port number in order to avoid * re-using same port immediately after it is closed. */ if (!ret) last_used_port = rover; if (ret != -EADDRNOTAVAIL) return ret; } if (--remaining) { rover++; if ((rover < low) || (rover > high)) rover = low; goto retry; } return -EADDRNOTAVAIL; } /* * Check that the requested port is available. This is called when trying to * bind to a specific port, or when trying to listen on a bound port. In * the latter case, the provided id_priv may already be on the bind_list, but * we still need to check that it's okay to start listening. 
*/ static int cma_check_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv, uint8_t reuseaddr) { struct rdma_id_private *cur_id; struct sockaddr *addr, *cur_addr; lockdep_assert_held(&lock); addr = cma_src_addr(id_priv); hlist_for_each_entry(cur_id, &bind_list->owners, node) { if (id_priv == cur_id) continue; if (reuseaddr && cur_id->reuseaddr) continue; cur_addr = cma_src_addr(cur_id); if (id_priv->afonly && cur_id->afonly && (addr->sa_family != cur_addr->sa_family)) continue; if (cma_any_addr(addr) || cma_any_addr(cur_addr)) return -EADDRNOTAVAIL; if (!cma_addr_cmp(addr, cur_addr)) return -EADDRINUSE; } return 0; } static int cma_use_port(enum rdma_ucm_port_space ps, struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list; unsigned short snum; int ret; lockdep_assert_held(&lock); snum = ntohs(cma_port(cma_src_addr(id_priv))); if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) return -EACCES; bind_list = cma_ps_find(id_priv->id.route.addr.dev_addr.net, ps, snum); if (!bind_list) { ret = cma_alloc_port(ps, id_priv, snum); } else { ret = cma_check_port(bind_list, id_priv, id_priv->reuseaddr); if (!ret) cma_bind_port(bind_list, id_priv); } return ret; } static enum rdma_ucm_port_space cma_select_inet_ps(struct rdma_id_private *id_priv) { switch (id_priv->id.ps) { case RDMA_PS_TCP: case RDMA_PS_UDP: case RDMA_PS_IPOIB: case RDMA_PS_IB: return id_priv->id.ps; default: return 0; } } static enum rdma_ucm_port_space cma_select_ib_ps(struct rdma_id_private *id_priv) { enum rdma_ucm_port_space ps = 0; struct sockaddr_ib *sib; u64 sid_ps, mask, sid; sib = (struct sockaddr_ib *) cma_src_addr(id_priv); mask = be64_to_cpu(sib->sib_sid_mask) & RDMA_IB_IP_PS_MASK; sid = be64_to_cpu(sib->sib_sid) & mask; if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) { sid_ps = RDMA_IB_IP_PS_IB; ps = RDMA_PS_IB; } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) && (sid == (RDMA_IB_IP_PS_TCP & mask))) { sid_ps = RDMA_IB_IP_PS_TCP; ps = RDMA_PS_TCP; } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) && (sid == (RDMA_IB_IP_PS_UDP & mask))) { sid_ps = RDMA_IB_IP_PS_UDP; ps = RDMA_PS_UDP; } if (ps) { sib->sib_sid = cpu_to_be64(sid_ps | ntohs(cma_port((struct sockaddr *) sib))); sib->sib_sid_mask = cpu_to_be64(RDMA_IB_IP_PS_MASK | be64_to_cpu(sib->sib_sid_mask)); } return ps; } static int cma_get_port(struct rdma_id_private *id_priv) { enum rdma_ucm_port_space ps; int ret; if (cma_family(id_priv) != AF_IB) ps = cma_select_inet_ps(id_priv); else ps = cma_select_ib_ps(id_priv); if (!ps) return -EPROTONOSUPPORT; mutex_lock(&lock); if (cma_any_port(cma_src_addr(id_priv))) ret = cma_alloc_any_port(ps, id_priv); else ret = cma_use_port(ps, id_priv); mutex_unlock(&lock); return ret; } static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, struct sockaddr *addr) { #if IS_ENABLED(CONFIG_IPV6) struct sockaddr_in6 *sin6; if (addr->sa_family != AF_INET6) return 0; sin6 = (struct sockaddr_in6 *) addr; if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) return 0; if (!sin6->sin6_scope_id) return -EINVAL; dev_addr->bound_dev_if = sin6->sin6_scope_id; #endif return 0; } int rdma_listen(struct rdma_cm_id *id, int backlog) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) { struct sockaddr_in any_in = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), }; /* For a well behaved ULP state will 
be RDMA_CM_IDLE */ ret = rdma_bind_addr(id, (struct sockaddr *)&any_in); if (ret) return ret; if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN))) return -EINVAL; } /* * Once the ID reaches RDMA_CM_LISTEN it is not allowed to be reusable * any more, and has to be unique in the bind list. */ if (id_priv->reuseaddr) { mutex_lock(&lock); ret = cma_check_port(id_priv->bind_list, id_priv, 0); if (!ret) id_priv->reuseaddr = 0; mutex_unlock(&lock); if (ret) goto err; } id_priv->backlog = backlog; if (id_priv->cma_dev) { if (rdma_cap_ib_cm(id->device, 1)) { ret = cma_ib_listen(id_priv); if (ret) goto err; } else if (rdma_cap_iw_cm(id->device, 1)) { ret = cma_iw_listen(id_priv, backlog); if (ret) goto err; } else { ret = -ENOSYS; goto err; } } else { ret = cma_listen_on_all(id_priv); if (ret) goto err; } return 0; err: id_priv->backlog = 0; /* * All the failure paths that lead here will not allow the req_handler's * to have run. */ cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND); return ret; } EXPORT_SYMBOL(rdma_listen); static int rdma_bind_addr_dst(struct rdma_id_private *id_priv, struct sockaddr *addr, const struct sockaddr *daddr) { struct sockaddr *id_daddr; int ret; if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6 && addr->sa_family != AF_IB) return -EAFNOSUPPORT; if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND)) return -EINVAL; ret = cma_check_linklocal(&id_priv->id.route.addr.dev_addr, addr); if (ret) goto err1; memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr)); if (!cma_any_addr(addr)) { ret = cma_translate_addr(addr, &id_priv->id.route.addr.dev_addr); if (ret) goto err1; ret = cma_acquire_dev_by_src_ip(id_priv); if (ret) goto err1; } if (!(id_priv->options & (1 << CMA_OPTION_AFONLY))) { if (addr->sa_family == AF_INET) id_priv->afonly = 1; #if IS_ENABLED(CONFIG_IPV6) else if (addr->sa_family == AF_INET6) { struct net *net = id_priv->id.route.addr.dev_addr.net; id_priv->afonly = net->ipv6.sysctl.bindv6only; } #endif } id_daddr = cma_dst_addr(id_priv); if (daddr != id_daddr) memcpy(id_daddr, daddr, rdma_addr_size(addr)); id_daddr->sa_family = addr->sa_family; ret = cma_get_port(id_priv); if (ret) goto err2; if (!cma_any_addr(addr)) rdma_restrack_add(&id_priv->res); return 0; err2: if (id_priv->cma_dev) cma_release_dev(id_priv); err1: cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); return ret; } static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, const struct sockaddr *dst_addr) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); struct sockaddr_storage zero_sock = {}; if (src_addr && src_addr->sa_family) return rdma_bind_addr_dst(id_priv, src_addr, dst_addr); /* * When the src_addr is not specified, automatically supply an any addr */ zero_sock.ss_family = dst_addr->sa_family; if (IS_ENABLED(CONFIG_IPV6) && dst_addr->sa_family == AF_INET6) { struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *)&zero_sock; struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *)dst_addr; src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id; if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL) id->route.addr.dev_addr.bound_dev_if = dst_addr6->sin6_scope_id; } else if (dst_addr->sa_family == AF_IB) { ((struct sockaddr_ib *)&zero_sock)->sib_pkey = ((struct sockaddr_ib *)dst_addr)->sib_pkey; } return rdma_bind_addr_dst(id_priv, (struct sockaddr *)&zero_sock, dst_addr); } /* * If required, resolve the source address for bind and leave the id_priv in * state 
RDMA_CM_ADDR_BOUND. This oddly uses the state to determine the prior * calls made by ULP, a previously bound ID will not be re-bound and src_addr is * ignored. */ static int resolve_prepare_src(struct rdma_id_private *id_priv, struct sockaddr *src_addr, const struct sockaddr *dst_addr) { int ret; if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) { /* For a well behaved ULP state will be RDMA_CM_IDLE */ ret = cma_bind_addr(&id_priv->id, src_addr, dst_addr); if (ret) return ret; if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY))) return -EINVAL; } else { memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); } if (cma_family(id_priv) != dst_addr->sa_family) { ret = -EINVAL; goto err_state; } return 0; err_state: cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); return ret; } int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, const struct sockaddr *dst_addr, unsigned long timeout_ms) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; ret = resolve_prepare_src(id_priv, src_addr, dst_addr); if (ret) return ret; if (cma_any_addr(dst_addr)) { ret = cma_resolve_loopback(id_priv); } else { if (dst_addr->sa_family == AF_IB) { ret = cma_resolve_ib_addr(id_priv); } else { /* * The FSM can return back to RDMA_CM_ADDR_BOUND after * rdma_resolve_ip() is called, eg through the error * path in addr_handler(). If this happens the existing * request must be canceled before issuing a new one. * Since canceling a request is a bit slow and this * oddball path is rare, keep track once a request has * been issued. The track turns out to be a permanent * state since this is the only cancel as it is * immediately before rdma_resolve_ip(). */ if (id_priv->used_resolve_ip) rdma_addr_cancel(&id->route.addr.dev_addr); else id_priv->used_resolve_ip = 1; ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr, &id->route.addr.dev_addr, timeout_ms, addr_handler, false, id_priv); } } if (ret) goto err; return 0; err: cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); return ret; } EXPORT_SYMBOL(rdma_resolve_addr); int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); return rdma_bind_addr_dst(id_priv, addr, cma_dst_addr(id_priv)); } EXPORT_SYMBOL(rdma_bind_addr); static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv) { struct cma_hdr *cma_hdr; cma_hdr = hdr; cma_hdr->cma_version = CMA_VERSION; if (cma_family(id_priv) == AF_INET) { struct sockaddr_in *src4, *dst4; src4 = (struct sockaddr_in *) cma_src_addr(id_priv); dst4 = (struct sockaddr_in *) cma_dst_addr(id_priv); cma_set_ip_ver(cma_hdr, 4); cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; cma_hdr->port = src4->sin_port; } else if (cma_family(id_priv) == AF_INET6) { struct sockaddr_in6 *src6, *dst6; src6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); dst6 = (struct sockaddr_in6 *) cma_dst_addr(id_priv); cma_set_ip_ver(cma_hdr, 6); cma_hdr->src_addr.ip6 = src6->sin6_addr; cma_hdr->dst_addr.ip6 = dst6->sin6_addr; cma_hdr->port = src6->sin6_port; } return 0; } static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv = cm_id->context; struct rdma_cm_event event = {}; const struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd; int ret; mutex_lock(&id_priv->handler_mutex); if 
(READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) goto out; switch (ib_event->event) { case IB_CM_SIDR_REQ_ERROR: event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; break; case IB_CM_SIDR_REP_RECEIVED: event.param.ud.private_data = ib_event->private_data; event.param.ud.private_data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE; if (rep->status != IB_SIDR_SUCCESS) { event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = ib_event->param.sidr_rep_rcvd.status; pr_debug_ratelimited("RDMA CM: UNREACHABLE: bad SIDR reply. status %d\n", event.status); break; } ret = cma_set_qkey(id_priv, rep->qkey); if (ret) { pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to set qkey. status %d\n", ret); event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = ret; break; } ib_init_ah_attr_from_path(id_priv->id.device, id_priv->id.port_num, id_priv->id.route.path_rec, &event.param.ud.ah_attr, rep->sgid_attr); event.param.ud.qp_num = rep->qpn; event.param.ud.qkey = rep->qkey; event.event = RDMA_CM_EVENT_ESTABLISHED; event.status = 0; break; default: pr_err("RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } ret = cma_cm_event_handler(id_priv, &event); rdma_destroy_ah_attr(&event.param.ud.ah_attr); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; destroy_id_handler_unlock(id_priv); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return 0; } static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_sidr_req_param req; struct ib_cm_id *id; void *private_data; u8 offset; int ret; memset(&req, 0, sizeof req); offset = cma_user_data_offset(id_priv); if (check_add_overflow(offset, conn_param->private_data_len, &req.private_data_len)) return -EINVAL; if (req.private_data_len) { private_data = kzalloc(req.private_data_len, GFP_ATOMIC); if (!private_data) return -ENOMEM; } else { private_data = NULL; } if (conn_param->private_data && conn_param->private_data_len) memcpy(private_data + offset, conn_param->private_data, conn_param->private_data_len); if (private_data) { ret = cma_format_hdr(private_data, id_priv); if (ret) goto out; req.private_data = private_data; } id = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler, id_priv); if (IS_ERR(id)) { ret = PTR_ERR(id); goto out; } id_priv->cm_id.ib = id; req.path = id_priv->id.route.path_rec; req.sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr; req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8); req.max_cm_retries = CMA_MAX_CM_RETRIES; trace_cm_send_sidr_req(id_priv); ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req); if (ret) { ib_destroy_cm_id(id_priv->cm_id.ib); id_priv->cm_id.ib = NULL; } out: kfree(private_data); return ret; } static int cma_connect_ib(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_req_param req; struct rdma_route *route; void *private_data; struct ib_cm_id *id; u8 offset; int ret; memset(&req, 0, sizeof req); offset = cma_user_data_offset(id_priv); if (check_add_overflow(offset, conn_param->private_data_len, &req.private_data_len)) return -EINVAL; if (req.private_data_len) { private_data = kzalloc(req.private_data_len, GFP_ATOMIC); if (!private_data) return -ENOMEM; } else { private_data = NULL; } if (conn_param->private_data && conn_param->private_data_len) memcpy(private_data + offset, conn_param->private_data, conn_param->private_data_len); id = ib_create_cm_id(id_priv->id.device, 
cma_ib_handler, id_priv); if (IS_ERR(id)) { ret = PTR_ERR(id); goto out; } id_priv->cm_id.ib = id; route = &id_priv->id.route; if (private_data) { ret = cma_format_hdr(private_data, id_priv); if (ret) goto out; req.private_data = private_data; } req.primary_path = &route->path_rec[0]; req.primary_path_inbound = route->path_rec_inbound; req.primary_path_outbound = route->path_rec_outbound; if (route->num_pri_alt_paths == 2) req.alternate_path = &route->path_rec[1]; req.ppath_sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr; /* Alternate path SGID attribute currently unsupported */ req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); req.qp_num = id_priv->qp_num; req.qp_type = id_priv->id.qp_type; req.starting_psn = id_priv->seq_num; req.responder_resources = conn_param->responder_resources; req.initiator_depth = conn_param->initiator_depth; req.flow_control = conn_param->flow_control; req.retry_count = min_t(u8, 7, conn_param->retry_count); req.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); req.remote_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; req.max_cm_retries = CMA_MAX_CM_RETRIES; req.srq = id_priv->srq ? 1 : 0; req.ece.vendor_id = id_priv->ece.vendor_id; req.ece.attr_mod = id_priv->ece.attr_mod; trace_cm_send_req(id_priv); ret = ib_send_cm_req(id_priv->cm_id.ib, &req); out: if (ret && !IS_ERR(id)) { ib_destroy_cm_id(id); id_priv->cm_id.ib = NULL; } kfree(private_data); return ret; } static int cma_connect_iw(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct iw_cm_id *cm_id; int ret; struct iw_cm_conn_param iw_param; cm_id = iw_create_cm_id(id_priv->id.device, cma_iw_handler, id_priv); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); mutex_lock(&id_priv->qp_mutex); cm_id->tos = id_priv->tos; cm_id->tos_set = id_priv->tos_set; mutex_unlock(&id_priv->qp_mutex); id_priv->cm_id.iw = cm_id; memcpy(&cm_id->local_addr, cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); memcpy(&cm_id->remote_addr, cma_dst_addr(id_priv), rdma_addr_size(cma_dst_addr(id_priv))); ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) goto out; if (conn_param) { iw_param.ord = conn_param->initiator_depth; iw_param.ird = conn_param->responder_resources; iw_param.private_data = conn_param->private_data; iw_param.private_data_len = conn_param->private_data_len; iw_param.qpn = id_priv->id.qp ? id_priv->qp_num : conn_param->qp_num; } else { memset(&iw_param, 0, sizeof iw_param); iw_param.qpn = id_priv->qp_num; } ret = iw_cm_connect(cm_id, &iw_param); out: if (ret) { iw_destroy_cm_id(cm_id); id_priv->cm_id.iw = NULL; } return ret; } /** * rdma_connect_locked - Initiate an active connection request. * @id: Connection identifier to connect. * @conn_param: Connection information used for connected QPs. * * Same as rdma_connect() but can only be called from the * RDMA_CM_EVENT_ROUTE_RESOLVED handler callback. 
*/ int rdma_connect_locked(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT)) return -EINVAL; if (!id->qp) { id_priv->qp_num = conn_param->qp_num; id_priv->srq = conn_param->srq; } if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) ret = cma_resolve_ib_udp(id_priv, conn_param); else ret = cma_connect_ib(id_priv, conn_param); } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = cma_connect_iw(id_priv, conn_param); } else { ret = -ENOSYS; } if (ret) goto err_state; return 0; err_state: cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED); return ret; } EXPORT_SYMBOL(rdma_connect_locked); /** * rdma_connect - Initiate an active connection request. * @id: Connection identifier to connect. * @conn_param: Connection information used for connected QPs. * * Users must have resolved a route for the rdma_cm_id to connect with by having * called rdma_resolve_route before calling this routine. * * This call will either connect to a remote QP or obtain remote QP information * for unconnected rdma_cm_id's. The actual operation is based on the * rdma_cm_id's port space. */ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; mutex_lock(&id_priv->handler_mutex); ret = rdma_connect_locked(id, conn_param); mutex_unlock(&id_priv->handler_mutex); return ret; } EXPORT_SYMBOL(rdma_connect); /** * rdma_connect_ece - Initiate an active connection request with ECE data. * @id: Connection identifier to connect. * @conn_param: Connection information used for connected QPs. * @ece: ECE parameters * * See rdma_connect() explanation. */ int rdma_connect_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, struct rdma_ucm_ece *ece) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); id_priv->ece.vendor_id = ece->vendor_id; id_priv->ece.attr_mod = ece->attr_mod; return rdma_connect(id, conn_param); } EXPORT_SYMBOL(rdma_connect_ece); static int cma_accept_ib(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_rep_param rep; int ret; ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) goto out; ret = cma_modify_qp_rts(id_priv, conn_param); if (ret) goto out; memset(&rep, 0, sizeof rep); rep.qp_num = id_priv->qp_num; rep.starting_psn = id_priv->seq_num; rep.private_data = conn_param->private_data; rep.private_data_len = conn_param->private_data_len; rep.responder_resources = conn_param->responder_resources; rep.initiator_depth = conn_param->initiator_depth; rep.failover_accepted = 0; rep.flow_control = conn_param->flow_control; rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); rep.srq = id_priv->srq ? 
1 : 0; rep.ece.vendor_id = id_priv->ece.vendor_id; rep.ece.attr_mod = id_priv->ece.attr_mod; trace_cm_send_rep(id_priv); ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep); out: return ret; } static int cma_accept_iw(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct iw_cm_conn_param iw_param; int ret; if (!conn_param) return -EINVAL; ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) return ret; iw_param.ord = conn_param->initiator_depth; iw_param.ird = conn_param->responder_resources; iw_param.private_data = conn_param->private_data; iw_param.private_data_len = conn_param->private_data_len; if (id_priv->id.qp) iw_param.qpn = id_priv->qp_num; else iw_param.qpn = conn_param->qp_num; return iw_cm_accept(id_priv->cm_id.iw, &iw_param); } static int cma_send_sidr_rep(struct rdma_id_private *id_priv, enum ib_cm_sidr_status status, u32 qkey, const void *private_data, int private_data_len) { struct ib_cm_sidr_rep_param rep; int ret; memset(&rep, 0, sizeof rep); rep.status = status; if (status == IB_SIDR_SUCCESS) { if (qkey) ret = cma_set_qkey(id_priv, qkey); else ret = cma_set_default_qkey(id_priv); if (ret) return ret; rep.qp_num = id_priv->qp_num; rep.qkey = id_priv->qkey; rep.ece.vendor_id = id_priv->ece.vendor_id; rep.ece.attr_mod = id_priv->ece.attr_mod; } rep.private_data = private_data; rep.private_data_len = private_data_len; trace_cm_send_sidr_rep(id_priv); return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); } /** * rdma_accept - Called to accept a connection request or response. * @id: Connection identifier associated with the request. * @conn_param: Information needed to establish the connection. This must be * provided if accepting a connection request. If accepting a connection * response, this parameter must be NULL. * * Typically, this routine is only called by the listener to accept a connection * request. It must also be called on the active side of a connection if the * user is performing their own QP transitions. * * In the case of error, a reject message is sent to the remote side and the * state of the qp associated with the id is modified to error, such that any * previously posted receive buffers would be flushed. * * This function is for use by kernel ULPs and must be called from under the * handler callback. 
*/ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; lockdep_assert_held(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) return -EINVAL; if (!id->qp && conn_param) { id_priv->qp_num = conn_param->qp_num; id_priv->srq = conn_param->srq; } if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) { if (conn_param) ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, conn_param->qkey, conn_param->private_data, conn_param->private_data_len); else ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, 0, NULL, 0); } else { if (conn_param) ret = cma_accept_ib(id_priv, conn_param); else ret = cma_rep_recv(id_priv); } } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = cma_accept_iw(id_priv, conn_param); } else { ret = -ENOSYS; } if (ret) goto reject; return 0; reject: cma_modify_qp_err(id_priv); rdma_reject(id, NULL, 0, IB_CM_REJ_CONSUMER_DEFINED); return ret; } EXPORT_SYMBOL(rdma_accept); int rdma_accept_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, struct rdma_ucm_ece *ece) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); id_priv->ece.vendor_id = ece->vendor_id; id_priv->ece.attr_mod = ece->attr_mod; return rdma_accept(id, conn_param); } EXPORT_SYMBOL(rdma_accept_ece); void rdma_lock_handler(struct rdma_cm_id *id) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->handler_mutex); } EXPORT_SYMBOL(rdma_lock_handler); void rdma_unlock_handler(struct rdma_cm_id *id) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); mutex_unlock(&id_priv->handler_mutex); } EXPORT_SYMBOL(rdma_unlock_handler); int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; switch (id->device->node_type) { case RDMA_NODE_IB_CA: ret = ib_cm_notify(id_priv->cm_id.ib, event); break; default: ret = 0; break; } return ret; } EXPORT_SYMBOL(rdma_notify); int rdma_reject(struct rdma_cm_id *id, const void *private_data, u8 private_data_len, u8 reason) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) { ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, 0, private_data, private_data_len); } else { trace_cm_send_rej(id_priv); ret = ib_send_cm_rej(id_priv->cm_id.ib, reason, NULL, 0, private_data, private_data_len); } } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_reject(id_priv->cm_id.iw, private_data, private_data_len); } else { ret = -ENOSYS; } return ret; } EXPORT_SYMBOL(rdma_reject); int rdma_disconnect(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; if (rdma_cap_ib_cm(id->device, id->port_num)) { ret = cma_modify_qp_err(id_priv); if (ret) goto out; /* Initiate or respond to a disconnect. 
*/ trace_cm_disconnect(id_priv); if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) { if (!ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0)) trace_cm_sent_drep(id_priv); } else { trace_cm_sent_dreq(id_priv); } } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_disconnect(id_priv->cm_id.iw, 0); } else ret = -EINVAL; out: return ret; } EXPORT_SYMBOL(rdma_disconnect); static void cma_make_mc_event(int status, struct rdma_id_private *id_priv, struct ib_sa_multicast *multicast, struct rdma_cm_event *event, struct cma_multicast *mc) { struct rdma_dev_addr *dev_addr; enum ib_gid_type gid_type; struct net_device *ndev; if (status) pr_debug_ratelimited("RDMA CM: MULTICAST_ERROR: failed to join multicast. status %d\n", status); event->status = status; event->param.ud.private_data = mc->context; if (status) { event->event = RDMA_CM_EVENT_MULTICAST_ERROR; return; } dev_addr = &id_priv->id.route.addr.dev_addr; ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); gid_type = id_priv->cma_dev ->default_gid_type[id_priv->id.port_num - rdma_start_port( id_priv->cma_dev->device)]; event->event = RDMA_CM_EVENT_MULTICAST_JOIN; if (ib_init_ah_from_mcmember(id_priv->id.device, id_priv->id.port_num, &multicast->rec, ndev, gid_type, &event->param.ud.ah_attr)) { event->event = RDMA_CM_EVENT_MULTICAST_ERROR; goto out; } event->param.ud.qp_num = 0xFFFFFF; event->param.ud.qkey = id_priv->qkey; out: dev_put(ndev); } static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) { struct cma_multicast *mc = multicast->context; struct rdma_id_private *id_priv = mc->id_priv; struct rdma_cm_event event = {}; int ret = 0; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL || READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING) goto out; ret = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey)); if (!ret) { cma_make_mc_event(status, id_priv, multicast, &event, mc); ret = cma_cm_event_handler(id_priv, &event); } rdma_destroy_ah_attr(&event.param.ud.ah_attr); WARN_ON(ret); out: mutex_unlock(&id_priv->handler_mutex); return 0; } static void cma_set_mgid(struct rdma_id_private *id_priv, struct sockaddr *addr, union ib_gid *mgid) { unsigned char mc_map[MAX_ADDR_LEN]; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct sockaddr_in *sin = (struct sockaddr_in *) addr; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr; if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); } else if ((addr->sa_family == AF_INET6) && ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) == 0xFF10A01B)) { /* IPv6 address is an SA assigned MGID. 
*/ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else if (addr->sa_family == AF_IB) { memcpy(mgid, &((struct sockaddr_ib *) addr)->sib_addr, sizeof *mgid); } else if (addr->sa_family == AF_INET6) { ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ *mgid = *(union ib_gid *) (mc_map + 4); } else { ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ *mgid = *(union ib_gid *) (mc_map + 4); } } static int cma_join_ib_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { struct ib_sa_mcmember_rec rec; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; ib_sa_comp_mask comp_mask; int ret; ib_addr_get_mgid(dev_addr, &rec.mgid); ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, &rec.mgid, &rec); if (ret) return ret; if (!id_priv->qkey) { ret = cma_set_default_qkey(id_priv); if (ret) return ret; } cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid); rec.qkey = cpu_to_be32(id_priv->qkey); rdma_addr_get_sgid(dev_addr, &rec.port_gid); rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); rec.join_state = mc->join_state; comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE | IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL | IB_SA_MCMEMBER_REC_FLOW_LABEL | IB_SA_MCMEMBER_REC_TRAFFIC_CLASS; if (id_priv->id.ps == RDMA_PS_IPOIB) comp_mask |= IB_SA_MCMEMBER_REC_RATE | IB_SA_MCMEMBER_REC_RATE_SELECTOR | IB_SA_MCMEMBER_REC_MTU_SELECTOR | IB_SA_MCMEMBER_REC_MTU | IB_SA_MCMEMBER_REC_HOP_LIMIT; mc->sa_mc = ib_sa_join_multicast(&sa_client, id_priv->id.device, id_priv->id.port_num, &rec, comp_mask, GFP_KERNEL, cma_ib_mc_handler, mc); return PTR_ERR_OR_ZERO(mc->sa_mc); } static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, enum ib_gid_type gid_type) { struct sockaddr_in *sin = (struct sockaddr_in *)addr; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); } else if (addr->sa_family == AF_INET6) { memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else { mgid->raw[0] = (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 0 : 0xff; mgid->raw[1] = (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 
0 : 0x0e; mgid->raw[2] = 0; mgid->raw[3] = 0; mgid->raw[4] = 0; mgid->raw[5] = 0; mgid->raw[6] = 0; mgid->raw[7] = 0; mgid->raw[8] = 0; mgid->raw[9] = 0; mgid->raw[10] = 0xff; mgid->raw[11] = 0xff; *(__be32 *)(&mgid->raw[12]) = sin->sin_addr.s_addr; } } static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int err = 0; struct sockaddr *addr = (struct sockaddr *)&mc->addr; struct net_device *ndev = NULL; struct ib_sa_multicast ib = {}; enum ib_gid_type gid_type; bool send_only; send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); if (cma_zero_addr(addr)) return -EINVAL; gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; cma_iboe_set_mgid(addr, &ib.rec.mgid, gid_type); ib.rec.pkey = cpu_to_be16(0xffff); if (dev_addr->bound_dev_if) ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (!ndev) return -ENODEV; ib.rec.rate = IB_RATE_PORT_CURRENT; ib.rec.hop_limit = 1; ib.rec.mtu = iboe_get_mtu(ndev->mtu); if (addr->sa_family == AF_INET) { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { ib.rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; if (!send_only) { err = cma_igmp_send(ndev, &ib.rec.mgid, true); } } } else { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) err = -ENOTSUPP; } dev_put(ndev); if (err || !ib.rec.mtu) return err ?: -EINVAL; if (!id_priv->qkey) cma_set_default_qkey(id_priv); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &ib.rec.port_gid); INIT_WORK(&mc->iboe_join.work, cma_iboe_join_work_handler); cma_make_mc_event(0, id_priv, &ib, &mc->iboe_join.event, mc); queue_work(cma_wq, &mc->iboe_join.work); return 0; } int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, u8 join_state, void *context) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); struct cma_multicast *mc; int ret; /* Not supported for kernel QPs */ if (WARN_ON(id->qp)) return -EINVAL; /* ULP is calling this wrong. 
*/ if (!id->device || (READ_ONCE(id_priv->state) != RDMA_CM_ADDR_BOUND && READ_ONCE(id_priv->state) != RDMA_CM_ADDR_RESOLVED)) return -EINVAL; if (id_priv->id.qp_type != IB_QPT_UD) return -EINVAL; mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) return -ENOMEM; memcpy(&mc->addr, addr, rdma_addr_size(addr)); mc->context = context; mc->id_priv = id_priv; mc->join_state = join_state; if (rdma_protocol_roce(id->device, id->port_num)) { ret = cma_iboe_join_multicast(id_priv, mc); if (ret) goto out_err; } else if (rdma_cap_ib_mcast(id->device, id->port_num)) { ret = cma_join_ib_multicast(id_priv, mc); if (ret) goto out_err; } else { ret = -ENOSYS; goto out_err; } spin_lock(&id_priv->lock); list_add(&mc->list, &id_priv->mc_list); spin_unlock(&id_priv->lock); return 0; out_err: kfree(mc); return ret; } EXPORT_SYMBOL(rdma_join_multicast); void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv; struct cma_multicast *mc; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irq(&id_priv->lock); list_for_each_entry(mc, &id_priv->mc_list, list) { if (memcmp(&mc->addr, addr, rdma_addr_size(addr)) != 0) continue; list_del(&mc->list); spin_unlock_irq(&id_priv->lock); WARN_ON(id_priv->cma_dev->device != id->device); destroy_mc(id_priv, mc); return; } spin_unlock_irq(&id_priv->lock); } EXPORT_SYMBOL(rdma_leave_multicast); static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr; struct cma_work *work; dev_addr = &id_priv->id.route.addr.dev_addr; if ((dev_addr->bound_dev_if == ndev->ifindex) && (net_eq(dev_net(ndev), dev_addr->net)) && memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) { pr_info("RDMA CM addr change for ndev %s used by id %p\n", ndev->name, &id_priv->id); work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; INIT_WORK(&work->work, cma_work_handler); work->id = id_priv; work->event.event = RDMA_CM_EVENT_ADDR_CHANGE; cma_id_get(id_priv); queue_work(cma_wq, &work->work); } return 0; } static int cma_netdev_callback(struct notifier_block *self, unsigned long event, void *ptr) { struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct cma_device *cma_dev; struct rdma_id_private *id_priv; int ret = NOTIFY_DONE; if (event != NETDEV_BONDING_FAILOVER) return NOTIFY_DONE; if (!netif_is_bond_master(ndev)) return NOTIFY_DONE; mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) list_for_each_entry(id_priv, &cma_dev->id_list, device_item) { ret = cma_netdev_change(ndev, id_priv); if (ret) goto out; } out: mutex_unlock(&lock); return ret; } static void cma_netevent_work_handler(struct work_struct *_work) { struct rdma_id_private *id_priv = container_of(_work, struct rdma_id_private, id.net_work); struct rdma_cm_event event = {}; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) goto out_unlock; event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; if (cma_cm_event_handler(id_priv, &event)) { __acquire(&id_priv->handler_mutex); id_priv->cm_id.ib = NULL; cma_id_put(id_priv); destroy_id_handler_unlock(id_priv); return; } out_unlock: mutex_unlock(&id_priv->handler_mutex); cma_id_put(id_priv); } static int cma_netevent_callback(struct notifier_block *self, unsigned long event, void *ctx) { struct id_table_entry *ips_node = NULL; struct rdma_id_private *current_id; struct neighbour *neigh = ctx; unsigned long flags; if (event != 
NETEVENT_NEIGH_UPDATE) return NOTIFY_DONE; spin_lock_irqsave(&id_table_lock, flags); if (neigh->tbl->family == AF_INET6) { struct sockaddr_in6 neigh_sock_6; neigh_sock_6.sin6_family = AF_INET6; neigh_sock_6.sin6_addr = *(struct in6_addr *)neigh->primary_key; ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex, (struct sockaddr *)&neigh_sock_6); } else if (neigh->tbl->family == AF_INET) { struct sockaddr_in neigh_sock_4; neigh_sock_4.sin_family = AF_INET; neigh_sock_4.sin_addr.s_addr = *(__be32 *)(neigh->primary_key); ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex, (struct sockaddr *)&neigh_sock_4); } else goto out; if (!ips_node) goto out; list_for_each_entry(current_id, &ips_node->id_list, id_list_entry) { if (!memcmp(current_id->id.route.addr.dev_addr.dst_dev_addr, neigh->ha, ETH_ALEN)) continue; INIT_WORK(&current_id->id.net_work, cma_netevent_work_handler); cma_id_get(current_id); queue_work(cma_wq, &current_id->id.net_work); } out: spin_unlock_irqrestore(&id_table_lock, flags); return NOTIFY_DONE; } static struct notifier_block cma_nb = { .notifier_call = cma_netdev_callback }; static struct notifier_block cma_netevent_cb = { .notifier_call = cma_netevent_callback }; static void cma_send_device_removal_put(struct rdma_id_private *id_priv) { struct rdma_cm_event event = { .event = RDMA_CM_EVENT_DEVICE_REMOVAL }; enum rdma_cm_state state; unsigned long flags; mutex_lock(&id_priv->handler_mutex); /* Record that we want to remove the device */ spin_lock_irqsave(&id_priv->lock, flags); state = id_priv->state; if (state == RDMA_CM_DESTROYING || state == RDMA_CM_DEVICE_REMOVAL) { spin_unlock_irqrestore(&id_priv->lock, flags); mutex_unlock(&id_priv->handler_mutex); cma_id_put(id_priv); return; } id_priv->state = RDMA_CM_DEVICE_REMOVAL; spin_unlock_irqrestore(&id_priv->lock, flags); if (cma_cm_event_handler(id_priv, &event)) { /* * At this point the ULP promises it won't call * rdma_destroy_id() concurrently */ cma_id_put(id_priv); mutex_unlock(&id_priv->handler_mutex); trace_cm_id_destroy(id_priv); _destroy_id(id_priv, state); return; } mutex_unlock(&id_priv->handler_mutex); /* * If this races with destroy then the thread that first assigns state * to a destroying does the cancel. 
*/ cma_cancel_operation(id_priv, state); cma_id_put(id_priv); } static void cma_process_remove(struct cma_device *cma_dev) { mutex_lock(&lock); while (!list_empty(&cma_dev->id_list)) { struct rdma_id_private *id_priv = list_first_entry( &cma_dev->id_list, struct rdma_id_private, device_item); list_del_init(&id_priv->listen_item); list_del_init(&id_priv->device_item); cma_id_get(id_priv); mutex_unlock(&lock); cma_send_device_removal_put(id_priv); mutex_lock(&lock); } mutex_unlock(&lock); cma_dev_put(cma_dev); wait_for_completion(&cma_dev->comp); } static bool cma_supported(struct ib_device *device) { u32 i; rdma_for_each_port(device, i) { if (rdma_cap_ib_cm(device, i) || rdma_cap_iw_cm(device, i)) return true; } return false; } static int cma_add_one(struct ib_device *device) { struct rdma_id_private *to_destroy; struct cma_device *cma_dev; struct rdma_id_private *id_priv; unsigned long supported_gids = 0; int ret; u32 i; if (!cma_supported(device)) return -EOPNOTSUPP; cma_dev = kmalloc(sizeof(*cma_dev), GFP_KERNEL); if (!cma_dev) return -ENOMEM; cma_dev->device = device; cma_dev->default_gid_type = kcalloc(device->phys_port_cnt, sizeof(*cma_dev->default_gid_type), GFP_KERNEL); if (!cma_dev->default_gid_type) { ret = -ENOMEM; goto free_cma_dev; } cma_dev->default_roce_tos = kcalloc(device->phys_port_cnt, sizeof(*cma_dev->default_roce_tos), GFP_KERNEL); if (!cma_dev->default_roce_tos) { ret = -ENOMEM; goto free_gid_type; } rdma_for_each_port (device, i) { supported_gids = roce_gid_type_mask_support(device, i); WARN_ON(!supported_gids); if (supported_gids & (1 << CMA_PREFERRED_ROCE_GID_TYPE)) cma_dev->default_gid_type[i - rdma_start_port(device)] = CMA_PREFERRED_ROCE_GID_TYPE; else cma_dev->default_gid_type[i - rdma_start_port(device)] = find_first_bit(&supported_gids, BITS_PER_LONG); cma_dev->default_roce_tos[i - rdma_start_port(device)] = 0; } init_completion(&cma_dev->comp); refcount_set(&cma_dev->refcount, 1); INIT_LIST_HEAD(&cma_dev->id_list); ib_set_client_data(device, &cma_client, cma_dev); mutex_lock(&lock); list_add_tail(&cma_dev->list, &dev_list); list_for_each_entry(id_priv, &listen_any_list, listen_any_item) { ret = cma_listen_on_dev(id_priv, cma_dev, &to_destroy); if (ret) goto free_listen; } mutex_unlock(&lock); trace_cm_add_one(device); return 0; free_listen: list_del(&cma_dev->list); mutex_unlock(&lock); /* cma_process_remove() will delete to_destroy */ cma_process_remove(cma_dev); kfree(cma_dev->default_roce_tos); free_gid_type: kfree(cma_dev->default_gid_type); free_cma_dev: kfree(cma_dev); return ret; } static void cma_remove_one(struct ib_device *device, void *client_data) { struct cma_device *cma_dev = client_data; trace_cm_remove_one(device); mutex_lock(&lock); list_del(&cma_dev->list); mutex_unlock(&lock); cma_process_remove(cma_dev); kfree(cma_dev->default_roce_tos); kfree(cma_dev->default_gid_type); kfree(cma_dev); } static int cma_init_net(struct net *net) { struct cma_pernet *pernet = cma_pernet(net); xa_init(&pernet->tcp_ps); xa_init(&pernet->udp_ps); xa_init(&pernet->ipoib_ps); xa_init(&pernet->ib_ps); return 0; } static void cma_exit_net(struct net *net) { struct cma_pernet *pernet = cma_pernet(net); WARN_ON(!xa_empty(&pernet->tcp_ps)); WARN_ON(!xa_empty(&pernet->udp_ps)); WARN_ON(!xa_empty(&pernet->ipoib_ps)); WARN_ON(!xa_empty(&pernet->ib_ps)); } static struct pernet_operations cma_pernet_operations = { .init = cma_init_net, .exit = cma_exit_net, .id = &cma_pernet_id, .size = sizeof(struct cma_pernet), }; static int __init cma_init(void) { int ret; /* * There 
	 * is a rare lock ordering dependency in cma_netdev_callback()
	 * that only happens when bonding is enabled. Teach lockdep that rtnl
	 * must never be nested under lock so it can find these without having
	 * to test with bonding.
	 */
	if (IS_ENABLED(CONFIG_LOCKDEP)) {
		rtnl_lock();
		mutex_lock(&lock);
		mutex_unlock(&lock);
		rtnl_unlock();
	}

	cma_wq = alloc_ordered_workqueue("rdma_cm", WQ_MEM_RECLAIM);
	if (!cma_wq)
		return -ENOMEM;

	ret = register_pernet_subsys(&cma_pernet_operations);
	if (ret)
		goto err_wq;

	ib_sa_register_client(&sa_client);
	register_netdevice_notifier(&cma_nb);
	register_netevent_notifier(&cma_netevent_cb);

	ret = ib_register_client(&cma_client);
	if (ret)
		goto err;

	ret = cma_configfs_init();
	if (ret)
		goto err_ib;

	return 0;

err_ib:
	ib_unregister_client(&cma_client);
err:
	unregister_netevent_notifier(&cma_netevent_cb);
	unregister_netdevice_notifier(&cma_nb);
	ib_sa_unregister_client(&sa_client);
	unregister_pernet_subsys(&cma_pernet_operations);
err_wq:
	destroy_workqueue(cma_wq);
	return ret;
}

static void __exit cma_cleanup(void)
{
	cma_configfs_exit();
	ib_unregister_client(&cma_client);
	unregister_netevent_notifier(&cma_netevent_cb);
	unregister_netdevice_notifier(&cma_nb);
	ib_sa_unregister_client(&sa_client);
	unregister_pernet_subsys(&cma_pernet_operations);
	destroy_workqueue(cma_wq);
}

module_init(cma_init);
module_exit(cma_cleanup);
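/*
 * Illustration (not part of cma.c): a minimal, self-contained sketch of the
 * lockdep-priming pattern cma_init() uses above with rtnl_lock and the cma
 * "lock" mutex.  The lock names here (outer_lock/inner_lock) are hypothetical.
 * Taking the locks once at init time, in the only permitted order, records the
 * dependency so lockdep can later flag any path that nests them the other way
 * round, even if that path is never exercised in testing.
 */
#include <linux/mutex.h>

static DEFINE_MUTEX(outer_lock);	/* must always be taken first  */
static DEFINE_MUTEX(inner_lock);	/* must always be taken second */

static void prime_lock_order(void)
{
	if (!IS_ENABLED(CONFIG_LOCKDEP))
		return;

	/* record "outer before inner" with lockdep, then drop both */
	mutex_lock(&outer_lock);
	mutex_lock(&inner_lock);
	mutex_unlock(&inner_lock);
	mutex_unlock(&outer_lock);
}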
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/ipc/namespace.c
 * Copyright (C) 2006 Pavel Emelyanov <xemul@openvz.org> OpenVZ, SWsoft Inc.
 */

#include <linux/ipc.h>
#include <linux/msg.h>
#include <linux/ipc_namespace.h>
#include <linux/rcupdate.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/user_namespace.h>
#include <linux/proc_ns.h>
#include <linux/sched/task.h>

#include "util.h"

/*
 * The work queue is used to avoid the cost of synchronize_rcu in kern_unmount.
 */
static void free_ipc(struct work_struct *unused);
static DECLARE_WORK(free_ipc_work, free_ipc);

static struct ucounts *inc_ipc_namespaces(struct user_namespace *ns)
{
	return inc_ucount(ns, current_euid(), UCOUNT_IPC_NAMESPACES);
}

static void dec_ipc_namespaces(struct ucounts *ucounts)
{
	dec_ucount(ucounts, UCOUNT_IPC_NAMESPACES);
}

static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
					   struct ipc_namespace *old_ns)
{
	struct ipc_namespace *ns;
	struct ucounts *ucounts;
	int err;

	err = -ENOSPC;
 again:
	ucounts = inc_ipc_namespaces(user_ns);
	if (!ucounts) {
		/*
		 * IPC namespaces are freed asynchronously, by free_ipc_work.
		 * If frees were pending, flush_work will wait, and
		 * return true. Fail the allocation if no frees are pending.
		 */
		if (flush_work(&free_ipc_work))
			goto again;
		goto fail;
	}

	err = -ENOMEM;
	ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL_ACCOUNT);
	if (ns == NULL)
		goto fail_dec;

	err = ns_alloc_inum(&ns->ns);
	if (err)
		goto fail_free;
	ns->ns.ops = &ipcns_operations;

	refcount_set(&ns->ns.count, 1);
	ns->user_ns = get_user_ns(user_ns);
	ns->ucounts = ucounts;

	err = mq_init_ns(ns);
	if (err)
		goto fail_put;

	err = -ENOMEM;
	if (!setup_mq_sysctls(ns))
		goto fail_put;

	if (!setup_ipc_sysctls(ns))
		goto fail_mq;

	err = msg_init_ns(ns);
	if (err)
		goto fail_ipc;

	sem_init_ns(ns);
	shm_init_ns(ns);

	return ns;

fail_ipc:
	retire_ipc_sysctls(ns);
fail_mq:
	retire_mq_sysctls(ns);
fail_put:
	put_user_ns(ns->user_ns);
	ns_free_inum(&ns->ns);
fail_free:
	kfree(ns);
fail_dec:
	dec_ipc_namespaces(ucounts);
fail:
	return ERR_PTR(err);
}

struct ipc_namespace *copy_ipcs(unsigned long flags,
	struct user_namespace *user_ns, struct ipc_namespace *ns)
{
	if (!(flags & CLONE_NEWIPC))
		return get_ipc_ns(ns);
	return create_ipc_ns(user_ns, ns);
}

/*
 * free_ipcs - free all ipcs of one type
 * @ns: the namespace to remove the ipcs from
 * @ids: the table of ipcs to free
 * @free: the function called to free each individual ipc
 *
 * Called for each kind of ipc when an ipc_namespace exits.
*/ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, void (*free)(struct ipc_namespace *, struct kern_ipc_perm *)) { struct kern_ipc_perm *perm; int next_id; int total, in_use; down_write(&ids->rwsem); in_use = ids->in_use; for (total = 0, next_id = 0; total < in_use; next_id++) { perm = idr_find(&ids->ipcs_idr, next_id); if (perm == NULL) continue; rcu_read_lock(); ipc_lock_object(perm); free(ns, perm); total++; } up_write(&ids->rwsem); } static void free_ipc_ns(struct ipc_namespace *ns) { /* * Caller needs to wait for an RCU grace period to have passed * after making the mount point inaccessible to new accesses. */ mntput(ns->mq_mnt); sem_exit_ns(ns); msg_exit_ns(ns); shm_exit_ns(ns); retire_mq_sysctls(ns); retire_ipc_sysctls(ns); dec_ipc_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); kfree(ns); } static LLIST_HEAD(free_ipc_list); static void free_ipc(struct work_struct *unused) { struct llist_node *node = llist_del_all(&free_ipc_list); struct ipc_namespace *n, *t; llist_for_each_entry_safe(n, t, node, mnt_llist) mnt_make_shortterm(n->mq_mnt); /* Wait for any last users to have gone away. */ synchronize_rcu(); llist_for_each_entry_safe(n, t, node, mnt_llist) free_ipc_ns(n); } /* * put_ipc_ns - drop a reference to an ipc namespace. * @ns: the namespace to put * * If this is the last task in the namespace exiting, and * it is dropping the refcount to 0, then it can race with * a task in another ipc namespace but in a mounts namespace * which has this ipcns's mqueuefs mounted, doing some action * with one of the mqueuefs files. That can raise the refcount. * So dropping the refcount, and raising the refcount when * accessing it through the VFS, are protected with mq_lock. * * (Clearly, a task raising the refcount on its own ipc_ns * needn't take mq_lock since it can't race with the last task * in the ipcns exiting). */ void put_ipc_ns(struct ipc_namespace *ns) { if (refcount_dec_and_lock(&ns->ns.count, &mq_lock)) { mq_clear_sbinfo(ns); spin_unlock(&mq_lock); if (llist_add(&ns->mnt_llist, &free_ipc_list)) schedule_work(&free_ipc_work); } } static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns) { return container_of(ns, struct ipc_namespace, ns); } static struct ns_common *ipcns_get(struct task_struct *task) { struct ipc_namespace *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) ns = get_ipc_ns(nsproxy->ipc_ns); task_unlock(task); return ns ? &ns->ns : NULL; } static void ipcns_put(struct ns_common *ns) { return put_ipc_ns(to_ipc_ns(ns)); } static int ipcns_install(struct nsset *nsset, struct ns_common *new) { struct nsproxy *nsproxy = nsset->nsproxy; struct ipc_namespace *ns = to_ipc_ns(new); if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; put_ipc_ns(nsproxy->ipc_ns); nsproxy->ipc_ns = get_ipc_ns(ns); return 0; } static struct user_namespace *ipcns_owner(struct ns_common *ns) { return to_ipc_ns(ns)->user_ns; } const struct proc_ns_operations ipcns_operations = { .name = "ipc", .type = CLONE_NEWIPC, .get = ipcns_get, .put = ipcns_put, .install = ipcns_install, .owner = ipcns_owner, };
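/*
 * Illustration (not part of ipc/namespace.c): a condensed sketch of the
 * deferred-free pattern put_ipc_ns() and free_ipc() implement above.  The
 * object type and names (deferred_obj, deferred_free, ...) are hypothetical.
 * Release paths only push the object onto a lockless list; llist_add()
 * returning true means the list was empty, so that one caller schedules the
 * worker, and the worker pays for a single synchronize_rcu() on behalf of
 * the whole batch instead of once per object.
 */
#include <linux/llist.h>
#include <linux/workqueue.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct deferred_obj {
	struct llist_node node;
	/* payload ... */
};

static LLIST_HEAD(deferred_list);

static void deferred_free_fn(struct work_struct *unused)
{
	struct llist_node *batch = llist_del_all(&deferred_list);
	struct deferred_obj *obj, *tmp;

	synchronize_rcu();	/* one grace period covers the whole batch */
	llist_for_each_entry_safe(obj, tmp, batch, node)
		kfree(obj);
}
static DECLARE_WORK(deferred_free_work, deferred_free_fn);

static void deferred_free(struct deferred_obj *obj)
{
	/* the caller that finds the list empty kicks the worker */
	if (llist_add(&obj->node, &deferred_list))
		schedule_work(&deferred_free_work);
}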
// SPDX-License-Identifier: GPL-2.0
#include <linux/memblock.h>
#include <linux/mmdebug.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <asm/page.h>
#include <linux/vmalloc.h>

#include "physaddr.h"

#ifdef CONFIG_X86_64

#ifdef CONFIG_DEBUG_VIRTUAL
unsigned long __phys_addr(unsigned long x)
{
	unsigned long y = x - __START_KERNEL_map;

	/* use the carry flag to determine if x was < __START_KERNEL_map */
	if (unlikely(x > y)) {
		x = y + phys_base;

		VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
	} else {
		x = y + (__START_KERNEL_map - PAGE_OFFSET);

		/* carry flag will be set if starting x was >= PAGE_OFFSET */
		VIRTUAL_BUG_ON((x > y) || !phys_addr_valid(x));
	}

	return x;
}
EXPORT_SYMBOL(__phys_addr);

unsigned long __phys_addr_symbol(unsigned long x)
{
	unsigned long y = x - __START_KERNEL_map;

	/* only check upper bounds since lower bounds will trigger carry */
	VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);

	return y + phys_base;
}
EXPORT_SYMBOL(__phys_addr_symbol);
#endif

bool __virt_addr_valid(unsigned long x)
{
	unsigned long y = x - __START_KERNEL_map;

	/* use the carry flag to determine if x was < __START_KERNEL_map */
	if (unlikely(x > y)) {
		x = y + phys_base;

		if (y >= KERNEL_IMAGE_SIZE)
			return false;
	} else {
		x = y + (__START_KERNEL_map - PAGE_OFFSET);

		/* carry flag will be set if starting x was >= PAGE_OFFSET */
		if ((x > y) || !phys_addr_valid(x))
			return false;
	}

	return pfn_valid(x >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__virt_addr_valid);

#else

#ifdef CONFIG_DEBUG_VIRTUAL
unsigned long __phys_addr(unsigned long x)
{
	unsigned long phys_addr = x - PAGE_OFFSET;

	/* VMALLOC_* aren't constants  */
	VIRTUAL_BUG_ON(x < PAGE_OFFSET);
	VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));

	/* max_low_pfn is set early, but not _that_ early */
	if (max_low_pfn) {
		VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn);
		BUG_ON(slow_virt_to_phys((void *)x) != phys_addr);
	}

	return phys_addr;
}
EXPORT_SYMBOL(__phys_addr);
#endif

bool __virt_addr_valid(unsigned long x)
{
	if (x < PAGE_OFFSET)
		return false;
	if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
		return false;
	if (x >= FIXADDR_START)
		return false;
	return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__virt_addr_valid);

#endif	/* CONFIG_X86_64 */
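/*
 * Illustration (not part of physaddr.c): a small standalone userspace demo of
 * the unsigned-wraparound trick __phys_addr() relies on above.  BASE is a
 * hypothetical stand-in for __START_KERNEL_map.  After y = x - BASE in
 * unsigned arithmetic, the single comparison x > y is true exactly when
 * x >= BASE, so one subtraction classifies the address without a separate
 * range check; on x86-64 the compiler reduces this to testing the carry flag
 * produced by the subtraction.
 */
#include <stdio.h>
#include <stdint.h>

#define BASE 0xffffffff80000000ULL	/* stand-in for __START_KERNEL_map */

static int is_above_base(uint64_t x)
{
	uint64_t y = x - BASE;		/* wraps around when x < BASE */

	return x > y;			/* true  <=>  x >= BASE */
}

int main(void)
{
	uint64_t samples[] = { 0, BASE - 1, BASE, BASE + 0x1000 };

	for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%#018llx is %s BASE\n",
		       (unsigned long long)samples[i],
		       is_above_base(samples[i]) ? ">=" : "< ");
	return 0;
}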
// SPDX-License-Identifier: GPL-2.0
/* net/atm/svc.c - ATM SVC sockets */

/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */

#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__

#include <linux/string.h>
#include <linux/net.h>		/* struct socket, struct proto_ops */
#include <linux/errno.h>	/* error codes */
#include <linux/kernel.h>	/* printk */
#include <linux/skbuff.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/fcntl.h>	/* O_NONBLOCK */
#include <linux/init.h>
#include <linux/atm.h>		/* ATM stuff */
#include <linux/atmsap.h>
#include <linux/atmsvc.h>
#include <linux/atmdev.h>
#include <linux/bitops.h>
#include <net/sock.h>		/* for sock_no_* */
#include <linux/uaccess.h>
#include <linux/export.h>

#include "resources.h"
#include "common.h"		/* common for PVCs and SVCs */
#include "signaling.h"
#include "addr.h"

#ifdef
CONFIG_COMPAT /* It actually takes struct sockaddr_atmsvc, not struct atm_iobuf */ #define COMPAT_ATM_ADDPARTY _IOW('a', ATMIOC_SPECIAL + 4, struct compat_atm_iobuf) #endif static int svc_create(struct net *net, struct socket *sock, int protocol, int kern); /* * Note: since all this is still nicely synchronized with the signaling demon, * there's no need to protect sleep loops with clis. If signaling is * moved into the kernel, that would change. */ static int svc_shutdown(struct socket *sock, int how) { return 0; } static void svc_disconnect(struct atm_vcc *vcc) { DEFINE_WAIT(wait); struct sk_buff *skb; struct sock *sk = sk_atm(vcc); pr_debug("%p\n", vcc); if (test_bit(ATM_VF_REGIS, &vcc->flags)) { sigd_enq(vcc, as_close, NULL, NULL, NULL); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); if (test_bit(ATM_VF_RELEASED, &vcc->flags) || !sigd) break; schedule(); } finish_wait(sk_sleep(sk), &wait); } /* beware - socket is still in use by atmsigd until the last as_indicate has been answered */ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { atm_return(vcc, skb->truesize); pr_debug("LISTEN REL\n"); sigd_enq2(NULL, as_reject, vcc, NULL, NULL, &vcc->qos, 0); dev_kfree_skb(skb); } clear_bit(ATM_VF_REGIS, &vcc->flags); /* ... may retry later */ } static int svc_release(struct socket *sock) { struct sock *sk = sock->sk; struct atm_vcc *vcc; if (sk) { vcc = ATM_SD(sock); pr_debug("%p\n", vcc); clear_bit(ATM_VF_READY, &vcc->flags); /* * VCC pointer is used as a reference, * so we must not free it (thereby subjecting it to re-use) * before all pending connections are closed */ svc_disconnect(vcc); vcc_release(sock); } return 0; } static int svc_bind(struct socket *sock, struct sockaddr *sockaddr, int sockaddr_len) { DEFINE_WAIT(wait); struct sock *sk = sock->sk; struct sockaddr_atmsvc *addr; struct atm_vcc *vcc; int error; if (sockaddr_len != sizeof(struct sockaddr_atmsvc)) return -EINVAL; lock_sock(sk); if (sock->state == SS_CONNECTED) { error = -EISCONN; goto out; } if (sock->state != SS_UNCONNECTED) { error = -EINVAL; goto out; } vcc = ATM_SD(sock); addr = (struct sockaddr_atmsvc *) sockaddr; if (addr->sas_family != AF_ATMSVC) { error = -EAFNOSUPPORT; goto out; } clear_bit(ATM_VF_BOUND, &vcc->flags); /* failing rebind will kill old binding */ /* @@@ check memory (de)allocation on rebind */ if (!test_bit(ATM_VF_HASQOS, &vcc->flags)) { error = -EBADFD; goto out; } vcc->local = *addr; set_bit(ATM_VF_WAITING, &vcc->flags); sigd_enq(vcc, as_bind, NULL, NULL, &vcc->local); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd) break; schedule(); } finish_wait(sk_sleep(sk), &wait); clear_bit(ATM_VF_REGIS, &vcc->flags); /* doesn't count */ if (!sigd) { error = -EUNATCH; goto out; } if (!sk->sk_err) set_bit(ATM_VF_BOUND, &vcc->flags); error = -sk->sk_err; out: release_sock(sk); return error; } static int svc_connect(struct socket *sock, struct sockaddr *sockaddr, int sockaddr_len, int flags) { DEFINE_WAIT(wait); struct sock *sk = sock->sk; struct sockaddr_atmsvc *addr; struct atm_vcc *vcc = ATM_SD(sock); int error; pr_debug("%p\n", vcc); lock_sock(sk); if (sockaddr_len != sizeof(struct sockaddr_atmsvc)) { error = -EINVAL; goto out; } switch (sock->state) { default: error = -EINVAL; goto out; case SS_CONNECTED: error = -EISCONN; goto out; case SS_CONNECTING: if (test_bit(ATM_VF_WAITING, &vcc->flags)) { error = -EALREADY; goto out; } sock->state = SS_UNCONNECTED; if (sk->sk_err) { error = -sk->sk_err; goto 
out; } break; case SS_UNCONNECTED: addr = (struct sockaddr_atmsvc *) sockaddr; if (addr->sas_family != AF_ATMSVC) { error = -EAFNOSUPPORT; goto out; } if (!test_bit(ATM_VF_HASQOS, &vcc->flags)) { error = -EBADFD; goto out; } if (vcc->qos.txtp.traffic_class == ATM_ANYCLASS || vcc->qos.rxtp.traffic_class == ATM_ANYCLASS) { error = -EINVAL; goto out; } if (!vcc->qos.txtp.traffic_class && !vcc->qos.rxtp.traffic_class) { error = -EINVAL; goto out; } vcc->remote = *addr; set_bit(ATM_VF_WAITING, &vcc->flags); sigd_enq(vcc, as_connect, NULL, NULL, &vcc->remote); if (flags & O_NONBLOCK) { sock->state = SS_CONNECTING; error = -EINPROGRESS; goto out; } error = 0; prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { schedule(); if (!signal_pending(current)) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); continue; } pr_debug("*ABORT*\n"); /* * This is tricky: * Kernel ---close--> Demon * Kernel <--close--- Demon * or * Kernel ---close--> Demon * Kernel <--error--- Demon * or * Kernel ---close--> Demon * Kernel <--okay---- Demon * Kernel <--close--- Demon */ sigd_enq(vcc, as_close, NULL, NULL, NULL); while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); schedule(); } if (!sk->sk_err) while (!test_bit(ATM_VF_RELEASED, &vcc->flags) && sigd) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); schedule(); } clear_bit(ATM_VF_REGIS, &vcc->flags); clear_bit(ATM_VF_RELEASED, &vcc->flags); clear_bit(ATM_VF_CLOSE, &vcc->flags); /* we're gone now but may connect later */ error = -EINTR; break; } finish_wait(sk_sleep(sk), &wait); if (error) goto out; if (!sigd) { error = -EUNATCH; goto out; } if (sk->sk_err) { error = -sk->sk_err; goto out; } } vcc->qos.txtp.max_pcr = SELECT_TOP_PCR(vcc->qos.txtp); vcc->qos.txtp.pcr = 0; vcc->qos.txtp.min_pcr = 0; error = vcc_connect(sock, vcc->itf, vcc->vpi, vcc->vci); if (!error) sock->state = SS_CONNECTED; else (void)svc_disconnect(vcc); out: release_sock(sk); return error; } static int svc_listen(struct socket *sock, int backlog) { DEFINE_WAIT(wait); struct sock *sk = sock->sk; struct atm_vcc *vcc = ATM_SD(sock); int error; pr_debug("%p\n", vcc); lock_sock(sk); /* let server handle listen on unbound sockets */ if (test_bit(ATM_VF_SESSION, &vcc->flags)) { error = -EINVAL; goto out; } if (test_bit(ATM_VF_LISTEN, &vcc->flags)) { error = -EADDRINUSE; goto out; } set_bit(ATM_VF_WAITING, &vcc->flags); sigd_enq(vcc, as_listen, NULL, NULL, &vcc->local); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd) break; schedule(); } finish_wait(sk_sleep(sk), &wait); if (!sigd) { error = -EUNATCH; goto out; } set_bit(ATM_VF_LISTEN, &vcc->flags); vcc_insert_socket(sk); sk->sk_max_ack_backlog = backlog > 0 ? 
backlog : ATM_BACKLOG_DEFAULT; error = -sk->sk_err; out: release_sock(sk); return error; } static int svc_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct sock *sk = sock->sk; struct sk_buff *skb; struct atmsvc_msg *msg; struct atm_vcc *old_vcc = ATM_SD(sock); struct atm_vcc *new_vcc; int error; lock_sock(sk); error = svc_create(sock_net(sk), newsock, 0, arg->kern); if (error) goto out; new_vcc = ATM_SD(newsock); pr_debug("%p -> %p\n", old_vcc, new_vcc); while (1) { DEFINE_WAIT(wait); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); while (!(skb = skb_dequeue(&sk->sk_receive_queue)) && sigd) { if (test_bit(ATM_VF_RELEASED, &old_vcc->flags)) break; if (test_bit(ATM_VF_CLOSE, &old_vcc->flags)) { error = -sk->sk_err; break; } if (arg->flags & O_NONBLOCK) { error = -EAGAIN; break; } release_sock(sk); schedule(); lock_sock(sk); if (signal_pending(current)) { error = -ERESTARTSYS; break; } prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); } finish_wait(sk_sleep(sk), &wait); if (error) goto out; if (!skb) { error = -EUNATCH; goto out; } msg = (struct atmsvc_msg *)skb->data; new_vcc->qos = msg->qos; set_bit(ATM_VF_HASQOS, &new_vcc->flags); new_vcc->remote = msg->svc; new_vcc->local = msg->local; new_vcc->sap = msg->sap; error = vcc_connect(newsock, msg->pvc.sap_addr.itf, msg->pvc.sap_addr.vpi, msg->pvc.sap_addr.vci); dev_kfree_skb(skb); sk_acceptq_removed(sk); if (error) { sigd_enq2(NULL, as_reject, old_vcc, NULL, NULL, &old_vcc->qos, error); error = error == -EAGAIN ? -EBUSY : error; goto out; } /* wait should be short, so we ignore the non-blocking flag */ set_bit(ATM_VF_WAITING, &new_vcc->flags); sigd_enq(new_vcc, as_accept, old_vcc, NULL, NULL); for (;;) { prepare_to_wait(sk_sleep(sk_atm(new_vcc)), &wait, TASK_UNINTERRUPTIBLE); if (!test_bit(ATM_VF_WAITING, &new_vcc->flags) || !sigd) break; release_sock(sk); schedule(); lock_sock(sk); } finish_wait(sk_sleep(sk_atm(new_vcc)), &wait); if (!sigd) { error = -EUNATCH; goto out; } if (!sk_atm(new_vcc)->sk_err) break; if (sk_atm(new_vcc)->sk_err != ERESTARTSYS) { error = -sk_atm(new_vcc)->sk_err; goto out; } } newsock->state = SS_CONNECTED; out: release_sock(sk); return error; } static int svc_getname(struct socket *sock, struct sockaddr *sockaddr, int peer) { struct sockaddr_atmsvc *addr; addr = (struct sockaddr_atmsvc *) sockaddr; memcpy(addr, peer ? 
&ATM_SD(sock)->remote : &ATM_SD(sock)->local, sizeof(struct sockaddr_atmsvc)); return sizeof(struct sockaddr_atmsvc); } int svc_change_qos(struct atm_vcc *vcc, struct atm_qos *qos) { struct sock *sk = sk_atm(vcc); DEFINE_WAIT(wait); set_bit(ATM_VF_WAITING, &vcc->flags); sigd_enq2(vcc, as_modify, NULL, NULL, &vcc->local, qos, 0); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); if (!test_bit(ATM_VF_WAITING, &vcc->flags) || test_bit(ATM_VF_RELEASED, &vcc->flags) || !sigd) { break; } schedule(); } finish_wait(sk_sleep(sk), &wait); if (!sigd) return -EUNATCH; return -sk->sk_err; } static int svc_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct atm_vcc *vcc = ATM_SD(sock); int value, error = 0; lock_sock(sk); switch (optname) { case SO_ATMSAP: if (level != SOL_ATM || optlen != sizeof(struct atm_sap)) { error = -EINVAL; goto out; } if (copy_from_sockptr(&vcc->sap, optval, optlen)) { error = -EFAULT; goto out; } set_bit(ATM_VF_HASSAP, &vcc->flags); break; case SO_MULTIPOINT: if (level != SOL_ATM || optlen != sizeof(int)) { error = -EINVAL; goto out; } if (copy_from_sockptr(&value, optval, sizeof(int))) { error = -EFAULT; goto out; } if (value == 1) set_bit(ATM_VF_SESSION, &vcc->flags); else if (value == 0) clear_bit(ATM_VF_SESSION, &vcc->flags); else error = -EINVAL; break; default: error = vcc_setsockopt(sock, level, optname, optval, optlen); } out: release_sock(sk); return error; } static int svc_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int error = 0, len; lock_sock(sk); if (!__SO_LEVEL_MATCH(optname, level) || optname != SO_ATMSAP) { error = vcc_getsockopt(sock, level, optname, optval, optlen); goto out; } if (get_user(len, optlen)) { error = -EFAULT; goto out; } if (len != sizeof(struct atm_sap)) { error = -EINVAL; goto out; } if (copy_to_user(optval, &ATM_SD(sock)->sap, sizeof(struct atm_sap))) { error = -EFAULT; goto out; } out: release_sock(sk); return error; } static int svc_addparty(struct socket *sock, struct sockaddr *sockaddr, int sockaddr_len, int flags) { DEFINE_WAIT(wait); struct sock *sk = sock->sk; struct atm_vcc *vcc = ATM_SD(sock); int error; lock_sock(sk); set_bit(ATM_VF_WAITING, &vcc->flags); sigd_enq(vcc, as_addparty, NULL, NULL, (struct sockaddr_atmsvc *) sockaddr); if (flags & O_NONBLOCK) { error = -EINPROGRESS; goto out; } pr_debug("added wait queue\n"); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd) break; schedule(); } finish_wait(sk_sleep(sk), &wait); error = -xchg(&sk->sk_err_soft, 0); out: release_sock(sk); return error; } static int svc_dropparty(struct socket *sock, int ep_ref) { DEFINE_WAIT(wait); struct sock *sk = sock->sk; struct atm_vcc *vcc = ATM_SD(sock); int error; lock_sock(sk); set_bit(ATM_VF_WAITING, &vcc->flags); sigd_enq2(vcc, as_dropparty, NULL, NULL, NULL, NULL, ep_ref); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd) break; schedule(); } finish_wait(sk_sleep(sk), &wait); if (!sigd) { error = -EUNATCH; goto out; } error = -xchg(&sk->sk_err_soft, 0); out: release_sock(sk); return error; } static int svc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { int error, ep_ref; struct sockaddr_atmsvc sa; struct atm_vcc *vcc = ATM_SD(sock); switch (cmd) { case ATM_ADDPARTY: if (!test_bit(ATM_VF_SESSION, 
&vcc->flags)) return -EINVAL; if (copy_from_user(&sa, (void __user *) arg, sizeof(sa))) return -EFAULT; error = svc_addparty(sock, (struct sockaddr *)&sa, sizeof(sa), 0); break; case ATM_DROPPARTY: if (!test_bit(ATM_VF_SESSION, &vcc->flags)) return -EINVAL; if (copy_from_user(&ep_ref, (void __user *) arg, sizeof(int))) return -EFAULT; error = svc_dropparty(sock, ep_ref); break; default: error = vcc_ioctl(sock, cmd, arg); } return error; } #ifdef CONFIG_COMPAT static int svc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { /* The definition of ATM_ADDPARTY uses the size of struct atm_iobuf. But actually it takes a struct sockaddr_atmsvc, which doesn't need compat handling. So all we have to do is fix up cmd... */ if (cmd == COMPAT_ATM_ADDPARTY) cmd = ATM_ADDPARTY; if (cmd == ATM_ADDPARTY || cmd == ATM_DROPPARTY) return svc_ioctl(sock, cmd, arg); else return vcc_compat_ioctl(sock, cmd, arg); } #endif /* CONFIG_COMPAT */ static const struct proto_ops svc_proto_ops = { .family = PF_ATMSVC, .owner = THIS_MODULE, .release = svc_release, .bind = svc_bind, .connect = svc_connect, .socketpair = sock_no_socketpair, .accept = svc_accept, .getname = svc_getname, .poll = vcc_poll, .ioctl = svc_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = svc_compat_ioctl, #endif .gettstamp = sock_gettstamp, .listen = svc_listen, .shutdown = svc_shutdown, .setsockopt = svc_setsockopt, .getsockopt = svc_getsockopt, .sendmsg = vcc_sendmsg, .recvmsg = vcc_recvmsg, .mmap = sock_no_mmap, }; static int svc_create(struct net *net, struct socket *sock, int protocol, int kern) { int error; if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; sock->ops = &svc_proto_ops; error = vcc_create(net, sock, protocol, AF_ATMSVC, kern); if (error) return error; ATM_SD(sock)->local.sas_family = AF_ATMSVC; ATM_SD(sock)->remote.sas_family = AF_ATMSVC; return 0; } static const struct net_proto_family svc_family_ops = { .family = PF_ATMSVC, .create = svc_create, .owner = THIS_MODULE, }; /* * Initialize the ATM SVC protocol family */ int __init atmsvc_init(void) { return sock_register(&svc_family_ops); } void atmsvc_exit(void) { sock_unregister(PF_ATMSVC); }
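/*
 * Illustration (not part of net/atm/svc.c): a condensed sketch of the wait
 * pattern every svc_* call above uses after queueing a message to the
 * signaling demon.  The request structure and names here (demon_request,
 * REQ_WAITING, ...) are hypothetical.  The caller marks the request as
 * outstanding, enqueues it, then sleeps with prepare_to_wait()/schedule()
 * until the reply handler clears the flag or the demon disappears, and
 * finally reads the outcome from the error field.
 */
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/errno.h>

struct demon_request {
	wait_queue_head_t	wq;		/* where the caller sleeps          */
	unsigned long		flags;		/* REQ_WAITING set while outstanding */
	bool			demon_alive;	/* cleared when the demon goes away  */
	int			err;		/* filled in by the reply handler    */
};
#define REQ_WAITING	0

static int wait_for_demon_reply(struct demon_request *req)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&req->wq, &wait, TASK_UNINTERRUPTIBLE);
		if (!test_bit(REQ_WAITING, &req->flags) || !req->demon_alive)
			break;
		schedule();
	}
	finish_wait(&req->wq, &wait);

	if (!req->demon_alive)
		return -EUNATCH;	/* signaling demon went away */
	return req->err;
}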
// SPDX-License-Identifier: GPL-2.0
/*
 * Central processing for nfsd.
 *
 * Authors:	Olaf Kirch (okir@monad.swb.de)
 *
 * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
 */

#include <linux/sched/signal.h>
#include <linux/freezer.h>
#include <linux/module.h>
#include <linux/fs_struct.h>
#include <linux/swap.h>
#include <linux/siphash.h>

#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/svcsock.h>
#include <linux/sunrpc/svc_xprt.h>
#include <linux/lockd/bind.h>
#include <linux/nfsacl.h>
#include <linux/nfslocalio.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <net/addrconf.h>
#include <net/ipv6.h>
#include <net/net_namespace.h>
#include "nfsd.h"
#include "cache.h"
#include "vfs.h"
#include "netns.h"
#include "filecache.h"

#include "trace.h"

#define NFSDDBG_FACILITY	NFSDDBG_SVC

atomic_t			nfsd_th_cnt = ATOMIC_INIT(0);
static int			nfsd(void *vrqstp);
#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
static int			nfsd_acl_rpcbind_set(struct net *,
						     const struct svc_program *,
						     u32, int,
						     unsigned short,
						     unsigned short);
static __be32			nfsd_acl_init_request(struct svc_rqst *,
						const struct svc_program *,
						struct svc_process_info *);
#endif
static int			nfsd_rpcbind_set(struct net *,
						 const struct svc_program *,
						 u32, int,
						 unsigned short,
						 unsigned short);
static __be32			nfsd_init_request(struct svc_rqst *,
						const struct svc_program *,
						struct svc_process_info *);

/*
 * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and some members
 * of the svc_serv struct such as ->sv_temp_socks and ->sv_permsocks.
 *
 * The nfsd_mutex also protects some of the global variables that are
 * accessed when nfsd starts and that are settable via the write_* routines in
 * nfsctl.c. In particular:
 *
 *	user_recovery_dirname
 *	user_lease_time
 *	nfsd_versions
 */
DEFINE_MUTEX(nfsd_mutex);

/*
 * nfsd_drc_lock protects nfsd_drc_max_mem and nfsd_drc_mem_used.
 * nfsd_drc_max_mem limits the total amount of memory available for
 * version 4.1 DRC caches.
 * nfsd_drc_mem_used tracks the current version 4.1 DRC memory usage.
*/ DEFINE_SPINLOCK(nfsd_drc_lock); unsigned long nfsd_drc_max_mem; unsigned long nfsd_drc_mem_used; #if IS_ENABLED(CONFIG_NFS_LOCALIO) static const struct svc_version *localio_versions[] = { [1] = &localio_version1, }; #define NFSD_LOCALIO_NRVERS ARRAY_SIZE(localio_versions) #endif /* CONFIG_NFS_LOCALIO */ #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static const struct svc_version *nfsd_acl_version[] = { # if defined(CONFIG_NFSD_V2_ACL) [2] = &nfsd_acl_version2, # endif # if defined(CONFIG_NFSD_V3_ACL) [3] = &nfsd_acl_version3, # endif }; #define NFSD_ACL_MINVERS 2 #define NFSD_ACL_NRVERS ARRAY_SIZE(nfsd_acl_version) #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ static const struct svc_version *nfsd_version[NFSD_MAXVERS+1] = { #if defined(CONFIG_NFSD_V2) [2] = &nfsd_version2, #endif [3] = &nfsd_version3, #if defined(CONFIG_NFSD_V4) [4] = &nfsd_version4, #endif }; struct svc_program nfsd_programs[] = { { .pg_prog = NFS_PROGRAM, /* program number */ .pg_nvers = NFSD_MAXVERS+1, /* nr of entries in nfsd_version */ .pg_vers = nfsd_version, /* version table */ .pg_name = "nfsd", /* program name */ .pg_class = "nfsd", /* authentication class */ .pg_authenticate = svc_set_client, /* export authentication */ .pg_init_request = nfsd_init_request, .pg_rpcbind_set = nfsd_rpcbind_set, }, #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) { .pg_prog = NFS_ACL_PROGRAM, .pg_nvers = NFSD_ACL_NRVERS, .pg_vers = nfsd_acl_version, .pg_name = "nfsacl", .pg_class = "nfsd", .pg_authenticate = svc_set_client, .pg_init_request = nfsd_acl_init_request, .pg_rpcbind_set = nfsd_acl_rpcbind_set, }, #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ #if IS_ENABLED(CONFIG_NFS_LOCALIO) { .pg_prog = NFS_LOCALIO_PROGRAM, .pg_nvers = NFSD_LOCALIO_NRVERS, .pg_vers = localio_versions, .pg_name = "nfslocalio", .pg_class = "nfsd", .pg_authenticate = svc_set_client, .pg_init_request = svc_generic_init_request, .pg_rpcbind_set = svc_generic_rpcbind_set, } #endif /* CONFIG_NFS_LOCALIO */ }; bool nfsd_support_version(int vers) { if (vers >= NFSD_MINVERS && vers <= NFSD_MAXVERS) return nfsd_version[vers] != NULL; return false; } int nfsd_vers(struct nfsd_net *nn, int vers, enum vers_op change) { if (vers < NFSD_MINVERS || vers > NFSD_MAXVERS) return 0; switch(change) { case NFSD_SET: nn->nfsd_versions[vers] = nfsd_support_version(vers); break; case NFSD_CLEAR: nn->nfsd_versions[vers] = false; break; case NFSD_TEST: return nn->nfsd_versions[vers]; case NFSD_AVAIL: return nfsd_support_version(vers); } return 0; } static void nfsd_adjust_nfsd_versions4(struct nfsd_net *nn) { unsigned i; for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++) { if (nn->nfsd4_minorversions[i]) return; } nfsd_vers(nn, 4, NFSD_CLEAR); } int nfsd_minorversion(struct nfsd_net *nn, u32 minorversion, enum vers_op change) { if (minorversion > NFSD_SUPPORTED_MINOR_VERSION && change != NFSD_AVAIL) return -1; switch(change) { case NFSD_SET: nfsd_vers(nn, 4, NFSD_SET); nn->nfsd4_minorversions[minorversion] = nfsd_vers(nn, 4, NFSD_TEST); break; case NFSD_CLEAR: nn->nfsd4_minorversions[minorversion] = false; nfsd_adjust_nfsd_versions4(nn); break; case NFSD_TEST: return nn->nfsd4_minorversions[minorversion]; case NFSD_AVAIL: return minorversion <= NFSD_SUPPORTED_MINOR_VERSION && nfsd_vers(nn, 4, NFSD_AVAIL); } return 0; } bool nfsd_serv_try_get(struct net *net) __must_hold(rcu) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); return (nn && percpu_ref_tryget_live(&nn->nfsd_serv_ref)); } void 
nfsd_serv_put(struct net *net) __must_hold(rcu) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); percpu_ref_put(&nn->nfsd_serv_ref); } static void nfsd_serv_done(struct percpu_ref *ref) { struct nfsd_net *nn = container_of(ref, struct nfsd_net, nfsd_serv_ref); complete(&nn->nfsd_serv_confirm_done); } static void nfsd_serv_free(struct percpu_ref *ref) { struct nfsd_net *nn = container_of(ref, struct nfsd_net, nfsd_serv_ref); complete(&nn->nfsd_serv_free_done); } /* * Maximum number of nfsd processes */ #define NFSD_MAXSERVS 8192 int nfsd_nrthreads(struct net *net) { int rv = 0; struct nfsd_net *nn = net_generic(net, nfsd_net_id); mutex_lock(&nfsd_mutex); if (nn->nfsd_serv) rv = nn->nfsd_serv->sv_nrthreads; mutex_unlock(&nfsd_mutex); return rv; } static int nfsd_init_socks(struct net *net, const struct cred *cred) { int error; struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (!list_empty(&nn->nfsd_serv->sv_permsocks)) return 0; error = svc_xprt_create(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT, SVC_SOCK_DEFAULTS, cred); if (error < 0) return error; error = svc_xprt_create(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT, SVC_SOCK_DEFAULTS, cred); if (error < 0) return error; return 0; } static int nfsd_users = 0; static int nfsd_startup_generic(void) { int ret; if (nfsd_users++) return 0; ret = nfsd_file_cache_init(); if (ret) goto dec_users; ret = nfs4_state_start(); if (ret) goto out_file_cache; return 0; out_file_cache: nfsd_file_cache_shutdown(); dec_users: nfsd_users--; return ret; } static void nfsd_shutdown_generic(void) { if (--nfsd_users) return; nfs4_state_shutdown(); nfsd_file_cache_shutdown(); } static bool nfsd_needs_lockd(struct nfsd_net *nn) { return nfsd_vers(nn, 2, NFSD_TEST) || nfsd_vers(nn, 3, NFSD_TEST); } /** * nfsd_copy_write_verifier - Atomically copy a write verifier * @verf: buffer in which to receive the verifier cookie * @nn: NFS net namespace * * This function provides a wait-free mechanism for copying the * namespace's write verifier without tearing it. */ void nfsd_copy_write_verifier(__be32 verf[2], struct nfsd_net *nn) { unsigned int seq; do { seq = read_seqbegin(&nn->writeverf_lock); memcpy(verf, nn->writeverf, sizeof(nn->writeverf)); } while (read_seqretry(&nn->writeverf_lock, seq)); } static void nfsd_reset_write_verifier_locked(struct nfsd_net *nn) { struct timespec64 now; u64 verf; /* * Because the time value is hashed, y2038 time_t overflow * is irrelevant in this usage. */ ktime_get_raw_ts64(&now); verf = siphash_2u64(now.tv_sec, now.tv_nsec, &nn->siphash_key); memcpy(nn->writeverf, &verf, sizeof(nn->writeverf)); } /** * nfsd_reset_write_verifier - Generate a new write verifier * @nn: NFS net namespace * * This function updates the ->writeverf field of @nn. This field * contains an opaque cookie that, according to Section 18.32.3 of * RFC 8881, "the client can use to determine whether a server has * changed instance state (e.g., server restart) between a call to * WRITE and a subsequent call to either WRITE or COMMIT. This * cookie MUST be unchanged during a single instance of the NFSv4.1 * server and MUST be unique between instances of the NFSv4.1 * server." */ void nfsd_reset_write_verifier(struct nfsd_net *nn) { write_seqlock(&nn->writeverf_lock); nfsd_reset_write_verifier_locked(nn); write_sequnlock(&nn->writeverf_lock); } /* * Crank up a set of per-namespace resources for a new NFSD instance, * including lockd, a duplicate reply cache, an open file cache * instance, and a cache of NFSv4 state objects. 
*/ static int nfsd_startup_net(struct net *net, const struct cred *cred) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); int ret; if (nn->nfsd_net_up) return 0; ret = nfsd_startup_generic(); if (ret) return ret; ret = nfsd_init_socks(net, cred); if (ret) goto out_socks; if (nfsd_needs_lockd(nn) && !nn->lockd_up) { ret = lockd_up(net, cred); if (ret) goto out_socks; nn->lockd_up = true; } ret = nfsd_file_cache_start_net(net); if (ret) goto out_lockd; ret = nfsd_reply_cache_init(nn); if (ret) goto out_filecache; ret = nfs4_state_start_net(net); if (ret) goto out_reply_cache; #ifdef CONFIG_NFSD_V4_2_INTER_SSC nfsd4_ssc_init_umount_work(nn); #endif nn->nfsd_net_up = true; return 0; out_reply_cache: nfsd_reply_cache_shutdown(nn); out_filecache: nfsd_file_cache_shutdown_net(net); out_lockd: if (nn->lockd_up) { lockd_down(net); nn->lockd_up = false; } out_socks: nfsd_shutdown_generic(); return ret; } static void nfsd_shutdown_net(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (!nn->nfsd_net_up) return; nfsd_export_flush(net); nfs4_state_shutdown_net(net); nfsd_reply_cache_shutdown(nn); nfsd_file_cache_shutdown_net(net); if (nn->lockd_up) { lockd_down(net); nn->lockd_up = false; } percpu_ref_exit(&nn->nfsd_serv_ref); nn->nfsd_net_up = false; nfsd_shutdown_generic(); } static DEFINE_SPINLOCK(nfsd_notifier_lock); static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct net_device *dev = ifa->ifa_dev->dev; struct net *net = dev_net(dev); struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct sockaddr_in sin; if (event != NETDEV_DOWN || !nn->nfsd_serv) goto out; spin_lock(&nfsd_notifier_lock); if (nn->nfsd_serv) { dprintk("nfsd_inetaddr_event: removed %pI4\n", &ifa->ifa_local); sin.sin_family = AF_INET; sin.sin_addr.s_addr = ifa->ifa_local; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin); } spin_unlock(&nfsd_notifier_lock); out: return NOTIFY_DONE; } static struct notifier_block nfsd_inetaddr_notifier = { .notifier_call = nfsd_inetaddr_event, }; #if IS_ENABLED(CONFIG_IPV6) static int nfsd_inet6addr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; struct net_device *dev = ifa->idev->dev; struct net *net = dev_net(dev); struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct sockaddr_in6 sin6; if (event != NETDEV_DOWN || !nn->nfsd_serv) goto out; spin_lock(&nfsd_notifier_lock); if (nn->nfsd_serv) { dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr); sin6.sin6_family = AF_INET6; sin6.sin6_addr = ifa->addr; if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL) sin6.sin6_scope_id = ifa->idev->dev->ifindex; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6); } spin_unlock(&nfsd_notifier_lock); out: return NOTIFY_DONE; } static struct notifier_block nfsd_inet6addr_notifier = { .notifier_call = nfsd_inet6addr_event, }; #endif /* Only used under nfsd_mutex, so this atomic may be overkill: */ static atomic_t nfsd_notifier_refcount = ATOMIC_INIT(0); /** * nfsd_destroy_serv - tear down NFSD's svc_serv for a namespace * @net: network namespace the NFS service is associated with */ void nfsd_destroy_serv(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv = nn->nfsd_serv; lockdep_assert_held(&nfsd_mutex); percpu_ref_kill_and_confirm(&nn->nfsd_serv_ref, nfsd_serv_done); wait_for_completion(&nn->nfsd_serv_confirm_done); 
	wait_for_completion(&nn->nfsd_serv_free_done);
	/* percpu_ref_exit is called in nfsd_shutdown_net */

	spin_lock(&nfsd_notifier_lock);
	nn->nfsd_serv = NULL;
	spin_unlock(&nfsd_notifier_lock);

	/* check if the notifier still has clients */
	if (atomic_dec_return(&nfsd_notifier_refcount) == 0) {
		unregister_inetaddr_notifier(&nfsd_inetaddr_notifier);
#if IS_ENABLED(CONFIG_IPV6)
		unregister_inet6addr_notifier(&nfsd_inet6addr_notifier);
#endif
	}

	svc_xprt_destroy_all(serv, net);

	/*
	 * write_ports can create the server without actually starting
	 * any threads--if we get shut down before any threads are
	 * started, then nfsd_destroy_serv will be run before any of this
	 * other initialization has been done except the rpcb information.
	 */
	svc_rpcb_cl