Total coverage: 95392 (6%)of 1834835
165 80 80 17 28 29 28 4 1 1 1 1 8 2 17 26 9 5 47 35 26 3 9 2 10 2 3 7 4 1 5 4 4 2 2 109 1 8 4 2 4 4 48 14 6 1 42 3 13 2 7 4 29 43 97 3 13 1 12 12 5 7 5 14 8 6 10 10 10 10 9 1 7 5 5 28 12 23 1 16 16 7 16 49 5 2 1 7 10 3 6 5 1 277 280 278 1 116 169 165 157 21 28 42 88 7 7 7 7 11 11 8 1 2 2 1 1 10 9 2 11 11 11 11 11 202 203 203 192 11 6 1 5 6 1 5 1 4 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2021, Red Hat. */ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/kernel.h> #include <linux/module.h> #include <net/sock.h> #include <net/protocol.h> #include <net/tcp.h> #include <net/mptcp.h> #include "protocol.h" #define MIN_INFO_OPTLEN_SIZE 16 #define MIN_FULL_INFO_OPTLEN_SIZE 40 static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk) { msk_owned_by_me(msk); if (likely(!__mptcp_check_fallback(msk))) return NULL; return msk->first; } static u32 sockopt_seq_reset(const struct sock *sk) { sock_owned_by_me(sk); /* Highbits contain state. Allows to distinguish sockopt_seq * of listener and established: * s0 = new_listener() * sockopt(s0) - seq is 1 * s1 = accept(s0) - s1 inherits seq 1 if listener sk (s0) * sockopt(s0) - seq increments to 2 on s0 * sockopt(s1) // seq increments to 2 on s1 (different option) * new ssk completes join, inherits options from s0 // seq 2 * Needs sync from mptcp join logic, but ssk->seq == msk->seq * * Set High order bits to sk_state so ssk->seq == msk->seq test * will fail. */ return (u32)sk->sk_state << 24u; } static void sockopt_seq_inc(struct mptcp_sock *msk) { u32 seq = (msk->setsockopt_seq + 1) & 0x00ffffff; msk->setsockopt_seq = sockopt_seq_reset((struct sock *)msk) + seq; } static int mptcp_get_int_option(struct mptcp_sock *msk, sockptr_t optval, unsigned int optlen, int *val) { if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(val, optval, sizeof(*val))) return -EFAULT; return 0; } static void mptcp_sol_socket_sync_intval(struct mptcp_sock *msk, int optname, int val) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; lock_sock(sk); sockopt_seq_inc(msk); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast(ssk); switch (optname) { case SO_DEBUG: sock_valbool_flag(ssk, SOCK_DBG, !!val); break; case SO_KEEPALIVE: if (ssk->sk_prot->keepalive) ssk->sk_prot->keepalive(ssk, !!val); sock_valbool_flag(ssk, SOCK_KEEPOPEN, !!val); break; case SO_PRIORITY: WRITE_ONCE(ssk->sk_priority, val); break; case SO_SNDBUF: case SO_SNDBUFFORCE: ssk->sk_userlocks |= SOCK_SNDBUF_LOCK; WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf); mptcp_subflow_ctx(ssk)->cached_sndbuf = sk->sk_sndbuf; break; case SO_RCVBUF: case SO_RCVBUFFORCE: ssk->sk_userlocks |= SOCK_RCVBUF_LOCK; WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf); break; case SO_MARK: if (READ_ONCE(ssk->sk_mark) != sk->sk_mark) { WRITE_ONCE(ssk->sk_mark, sk->sk_mark); sk_dst_reset(ssk); } break; case SO_INCOMING_CPU: WRITE_ONCE(ssk->sk_incoming_cpu, val); break; } subflow->setsockopt_seq = msk->setsockopt_seq; unlock_sock_fast(ssk, slow); } release_sock(sk); } static int mptcp_sol_socket_intval(struct mptcp_sock *msk, int optname, int val) { sockptr_t optval = KERNEL_SOCKPTR(&val); struct sock *sk = (struct sock *)msk; int ret; ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, sizeof(val)); if (ret) return ret; mptcp_sol_socket_sync_intval(msk, optname, val); return 0; } static void mptcp_so_incoming_cpu(struct mptcp_sock *msk, int val) { struct sock *sk = (struct sock *)msk; WRITE_ONCE(sk->sk_incoming_cpu, val); mptcp_sol_socket_sync_intval(msk, SO_INCOMING_CPU, val); } static int mptcp_setsockopt_sol_socket_tstamp(struct mptcp_sock *msk, int optname, int val) { sockptr_t optval = KERNEL_SOCKPTR(&val); struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; int ret; ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, sizeof(val)); if (ret) return ret; lock_sock(sk); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast(ssk); sock_set_timestamp(sk, optname, !!val); unlock_sock_fast(ssk, slow); } release_sock(sk); return 0; } static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { int val, ret; ret = mptcp_get_int_option(msk, optval, optlen, &val); if (ret) return ret; switch (optname) { case SO_KEEPALIVE: case SO_DEBUG: case SO_MARK: case SO_PRIORITY: case SO_SNDBUF: case SO_SNDBUFFORCE: case SO_RCVBUF: case SO_RCVBUFFORCE: return mptcp_sol_socket_intval(msk, optname, val); case SO_INCOMING_CPU: mptcp_so_incoming_cpu(msk, val); return 0; case SO_TIMESTAMP_OLD: case SO_TIMESTAMP_NEW: case SO_TIMESTAMPNS_OLD: case SO_TIMESTAMPNS_NEW: return mptcp_setsockopt_sol_socket_tstamp(msk, optname, val); } return -ENOPROTOOPT; } static int mptcp_setsockopt_sol_socket_timestamping(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; struct so_timestamping timestamping; int ret; if (optlen == sizeof(timestamping)) { if (copy_from_sockptr(&timestamping, optval, sizeof(timestamping))) return -EFAULT; } else if (optlen == sizeof(int)) { memset(&timestamping, 0, sizeof(timestamping)); if (copy_from_sockptr(&timestamping.flags, optval, sizeof(int))) return -EFAULT; } else { return -EINVAL; } ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, KERNEL_SOCKPTR(&timestamping), sizeof(timestamping)); if (ret) return ret; lock_sock(sk); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast(ssk); sock_set_timestamping(sk, optname, timestamping); unlock_sock_fast(ssk, slow); } release_sock(sk); return 0; } static int mptcp_setsockopt_sol_socket_linger(struct mptcp_sock *msk, sockptr_t optval, unsigned int optlen) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; struct linger ling; sockptr_t kopt; int ret; if (optlen < sizeof(ling)) return -EINVAL; if (copy_from_sockptr(&ling, optval, sizeof(ling))) return -EFAULT; kopt = KERNEL_SOCKPTR(&ling); ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, SO_LINGER, kopt, sizeof(ling)); if (ret) return ret; lock_sock(sk); sockopt_seq_inc(msk); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast(ssk); if (!ling.l_onoff) { sock_reset_flag(ssk, SOCK_LINGER); } else { ssk->sk_lingertime = sk->sk_lingertime; sock_set_flag(ssk, SOCK_LINGER); } subflow->setsockopt_seq = msk->setsockopt_seq; unlock_sock_fast(ssk, slow); } release_sock(sk); return 0; } static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = (struct sock *)msk; struct sock *ssk; int ret; switch (optname) { case SO_REUSEPORT: case SO_REUSEADDR: case SO_BINDTODEVICE: case SO_BINDTOIFINDEX: lock_sock(sk); ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) { release_sock(sk); return PTR_ERR(ssk); } ret = sk_setsockopt(ssk, SOL_SOCKET, optname, optval, optlen); if (ret == 0) { if (optname == SO_REUSEPORT) sk->sk_reuseport = ssk->sk_reuseport; else if (optname == SO_REUSEADDR) sk->sk_reuse = ssk->sk_reuse; else if (optname == SO_BINDTODEVICE) sk->sk_bound_dev_if = ssk->sk_bound_dev_if; else if (optname == SO_BINDTOIFINDEX) sk->sk_bound_dev_if = ssk->sk_bound_dev_if; } release_sock(sk); return ret; case SO_KEEPALIVE: case SO_PRIORITY: case SO_SNDBUF: case SO_SNDBUFFORCE: case SO_RCVBUF: case SO_RCVBUFFORCE: case SO_MARK: case SO_INCOMING_CPU: case SO_DEBUG: case SO_TIMESTAMP_OLD: case SO_TIMESTAMP_NEW: case SO_TIMESTAMPNS_OLD: case SO_TIMESTAMPNS_NEW: return mptcp_setsockopt_sol_socket_int(msk, optname, optval, optlen); case SO_TIMESTAMPING_OLD: case SO_TIMESTAMPING_NEW: return mptcp_setsockopt_sol_socket_timestamping(msk, optname, optval, optlen); case SO_LINGER: return mptcp_setsockopt_sol_socket_linger(msk, optval, optlen); case SO_RCVLOWAT: case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: case SO_SNDTIMEO_OLD: case SO_SNDTIMEO_NEW: case SO_BUSY_POLL: case SO_PREFER_BUSY_POLL: case SO_BUSY_POLL_BUDGET: /* No need to copy: only relevant for msk */ return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen); case SO_NO_CHECK: case SO_DONTROUTE: case SO_BROADCAST: case SO_BSDCOMPAT: case SO_PASSCRED: case SO_PASSPIDFD: case SO_PASSSEC: case SO_RXQ_OVFL: case SO_WIFI_STATUS: case SO_NOFCS: case SO_SELECT_ERR_QUEUE: return 0; } /* SO_OOBINLINE is not supported, let's avoid the related mess * SO_ATTACH_FILTER, SO_ATTACH_BPF, SO_ATTACH_REUSEPORT_CBPF, * SO_DETACH_REUSEPORT_BPF, SO_DETACH_FILTER, SO_LOCK_FILTER, * we must be careful with subflows * * SO_ATTACH_REUSEPORT_EBPF is not supported, at it checks * explicitly the sk_protocol field * * SO_PEEK_OFF is unsupported, as it is for plain TCP * SO_MAX_PACING_RATE is unsupported, we must be careful with subflows * SO_CNX_ADVICE is currently unsupported, could possibly be relevant, * but likely needs careful design * * SO_ZEROCOPY is currently unsupported, TODO in sndmsg * SO_TXTIME is currently unsupported */ return -EOPNOTSUPP; } static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = (struct sock *)msk; int ret = -EOPNOTSUPP; struct sock *ssk; switch (optname) { case IPV6_V6ONLY: case IPV6_TRANSPARENT: case IPV6_FREEBIND: lock_sock(sk); ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) { release_sock(sk); return PTR_ERR(ssk); } ret = tcp_setsockopt(ssk, SOL_IPV6, optname, optval, optlen); if (ret != 0) { release_sock(sk); return ret; } sockopt_seq_inc(msk); switch (optname) { case IPV6_V6ONLY: sk->sk_ipv6only = ssk->sk_ipv6only; break; case IPV6_TRANSPARENT: inet_assign_bit(TRANSPARENT, sk, inet_test_bit(TRANSPARENT, ssk)); break; case IPV6_FREEBIND: inet_assign_bit(FREEBIND, sk, inet_test_bit(FREEBIND, ssk)); break; } release_sock(sk); break; } return ret; } static bool mptcp_supported_sockopt(int level, int optname) { if (level == SOL_IP) { switch (optname) { /* should work fine */ case IP_FREEBIND: case IP_TRANSPARENT: case IP_BIND_ADDRESS_NO_PORT: case IP_LOCAL_PORT_RANGE: /* the following are control cmsg related */ case IP_PKTINFO: case IP_RECVTTL: case IP_RECVTOS: case IP_RECVOPTS: case IP_RETOPTS: case IP_PASSSEC: case IP_RECVORIGDSTADDR: case IP_CHECKSUM: case IP_RECVFRAGSIZE: /* common stuff that need some love */ case IP_TOS: case IP_TTL: case IP_MTU_DISCOVER: case IP_RECVERR: /* possibly less common may deserve some love */ case IP_MINTTL: /* the following is apparently a no-op for plain TCP */ case IP_RECVERR_RFC4884: return true; } /* IP_OPTIONS is not supported, needs subflow care */ /* IP_HDRINCL, IP_NODEFRAG are not supported, RAW specific */ /* IP_MULTICAST_TTL, IP_MULTICAST_LOOP, IP_UNICAST_IF, * IP_ADD_MEMBERSHIP, IP_ADD_SOURCE_MEMBERSHIP, IP_DROP_MEMBERSHIP, * IP_DROP_SOURCE_MEMBERSHIP, IP_BLOCK_SOURCE, IP_UNBLOCK_SOURCE, * MCAST_JOIN_GROUP, MCAST_LEAVE_GROUP MCAST_JOIN_SOURCE_GROUP, * MCAST_LEAVE_SOURCE_GROUP, MCAST_BLOCK_SOURCE, MCAST_UNBLOCK_SOURCE, * MCAST_MSFILTER, IP_MULTICAST_ALL are not supported, better not deal * with mcast stuff */ /* IP_IPSEC_POLICY, IP_XFRM_POLICY are nut supported, unrelated here */ return false; } if (level == SOL_IPV6) { switch (optname) { case IPV6_V6ONLY: /* the following are control cmsg related */ case IPV6_RECVPKTINFO: case IPV6_2292PKTINFO: case IPV6_RECVHOPLIMIT: case IPV6_2292HOPLIMIT: case IPV6_RECVRTHDR: case IPV6_2292RTHDR: case IPV6_RECVHOPOPTS: case IPV6_2292HOPOPTS: case IPV6_RECVDSTOPTS: case IPV6_2292DSTOPTS: case IPV6_RECVTCLASS: case IPV6_FLOWINFO: case IPV6_RECVPATHMTU: case IPV6_RECVORIGDSTADDR: case IPV6_RECVFRAGSIZE: /* the following ones need some love but are quite common */ case IPV6_TCLASS: case IPV6_TRANSPARENT: case IPV6_FREEBIND: case IPV6_PKTINFO: case IPV6_2292PKTOPTIONS: case IPV6_UNICAST_HOPS: case IPV6_MTU_DISCOVER: case IPV6_MTU: case IPV6_RECVERR: case IPV6_FLOWINFO_SEND: case IPV6_FLOWLABEL_MGR: case IPV6_MINHOPCOUNT: case IPV6_DONTFRAG: case IPV6_AUTOFLOWLABEL: /* the following one is a no-op for plain TCP */ case IPV6_RECVERR_RFC4884: return true; } /* IPV6_HOPOPTS, IPV6_RTHDRDSTOPTS, IPV6_RTHDR, IPV6_DSTOPTS are * not supported */ /* IPV6_MULTICAST_HOPS, IPV6_MULTICAST_LOOP, IPV6_UNICAST_IF, * IPV6_MULTICAST_IF, IPV6_ADDRFORM, * IPV6_ADD_MEMBERSHIP, IPV6_DROP_MEMBERSHIP, IPV6_JOIN_ANYCAST, * IPV6_LEAVE_ANYCAST, IPV6_MULTICAST_ALL, MCAST_JOIN_GROUP, MCAST_LEAVE_GROUP, * MCAST_JOIN_SOURCE_GROUP, MCAST_LEAVE_SOURCE_GROUP, * MCAST_BLOCK_SOURCE, MCAST_UNBLOCK_SOURCE, MCAST_MSFILTER * are not supported better not deal with mcast */ /* IPV6_ROUTER_ALERT, IPV6_ROUTER_ALERT_ISOLATE are not supported, since are evil */ /* IPV6_IPSEC_POLICY, IPV6_XFRM_POLICY are not supported */ /* IPV6_ADDR_PREFERENCES is not supported, we must be careful with subflows */ return false; } if (level == SOL_TCP) { switch (optname) { /* the following are no-op or should work just fine */ case TCP_THIN_DUPACK: case TCP_DEFER_ACCEPT: /* the following need some love */ case TCP_MAXSEG: case TCP_NODELAY: case TCP_THIN_LINEAR_TIMEOUTS: case TCP_CONGESTION: case TCP_CORK: case TCP_KEEPIDLE: case TCP_KEEPINTVL: case TCP_KEEPCNT: case TCP_SYNCNT: case TCP_SAVE_SYN: case TCP_LINGER2: case TCP_WINDOW_CLAMP: case TCP_QUICKACK: case TCP_USER_TIMEOUT: case TCP_TIMESTAMP: case TCP_NOTSENT_LOWAT: case TCP_TX_DELAY: case TCP_INQ: case TCP_FASTOPEN: case TCP_FASTOPEN_CONNECT: case TCP_FASTOPEN_KEY: case TCP_FASTOPEN_NO_COOKIE: return true; } /* TCP_MD5SIG, TCP_MD5SIG_EXT are not supported, MD5 is not compatible with MPTCP */ /* TCP_REPAIR, TCP_REPAIR_QUEUE, TCP_QUEUE_SEQ, TCP_REPAIR_OPTIONS, * TCP_REPAIR_WINDOW are not supported, better avoid this mess */ } return false; } static int mptcp_setsockopt_sol_tcp_congestion(struct mptcp_sock *msk, sockptr_t optval, unsigned int optlen) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; char name[TCP_CA_NAME_MAX]; bool cap_net_admin; int ret; if (optlen < 1) return -EINVAL; ret = strncpy_from_sockptr(name, optval, min_t(long, TCP_CA_NAME_MAX - 1, optlen)); if (ret < 0) return -EFAULT; name[ret] = 0; cap_net_admin = ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN); ret = 0; lock_sock(sk); sockopt_seq_inc(msk); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int err; lock_sock(ssk); err = tcp_set_congestion_control(ssk, name, true, cap_net_admin); if (err < 0 && ret == 0) ret = err; subflow->setsockopt_seq = msk->setsockopt_seq; release_sock(ssk); } if (ret == 0) strscpy(msk->ca_name, name, sizeof(msk->ca_name)); release_sock(sk); return ret; } static int __mptcp_setsockopt_set_val(struct mptcp_sock *msk, int max, int (*set_val)(struct sock *, int), int *msk_val, int val) { struct mptcp_subflow_context *subflow; int err = 0; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int ret; lock_sock(ssk); ret = set_val(ssk, val); err = err ? : ret; release_sock(ssk); } if (!err) { *msk_val = val; sockopt_seq_inc(msk); } return err; } static int __mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, int val) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; sockopt_seq_inc(msk); msk->cork = !!val; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); lock_sock(ssk); __tcp_sock_set_cork(ssk, !!val); release_sock(ssk); } if (!val) mptcp_check_and_set_pending(sk); return 0; } static int __mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, int val) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; sockopt_seq_inc(msk); msk->nodelay = !!val; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); lock_sock(ssk); __tcp_sock_set_nodelay(ssk, !!val); release_sock(ssk); } if (val) mptcp_check_and_set_pending(sk); return 0; } static int mptcp_setsockopt_sol_ip_set(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = (struct sock *)msk; struct sock *ssk; int err; err = ip_setsockopt(sk, SOL_IP, optname, optval, optlen); if (err != 0) return err; lock_sock(sk); ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) { release_sock(sk); return PTR_ERR(ssk); } switch (optname) { case IP_FREEBIND: inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk)); break; case IP_TRANSPARENT: inet_assign_bit(TRANSPARENT, ssk, inet_test_bit(TRANSPARENT, sk)); break; case IP_BIND_ADDRESS_NO_PORT: inet_assign_bit(BIND_ADDRESS_NO_PORT, ssk, inet_test_bit(BIND_ADDRESS_NO_PORT, sk)); break; case IP_LOCAL_PORT_RANGE: WRITE_ONCE(inet_sk(ssk)->local_port_range, READ_ONCE(inet_sk(sk)->local_port_range)); break; default: release_sock(sk); WARN_ON_ONCE(1); return -EOPNOTSUPP; } sockopt_seq_inc(msk); release_sock(sk); return 0; } static int mptcp_setsockopt_v4_set_tos(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; int err, val; err = ip_setsockopt(sk, SOL_IP, optname, optval, optlen); if (err != 0) return err; lock_sock(sk); sockopt_seq_inc(msk); val = READ_ONCE(inet_sk(sk)->tos); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow; slow = lock_sock_fast(ssk); __ip_sock_set_tos(ssk, val); unlock_sock_fast(ssk, slow); } release_sock(sk); return 0; } static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { switch (optname) { case IP_FREEBIND: case IP_TRANSPARENT: case IP_BIND_ADDRESS_NO_PORT: case IP_LOCAL_PORT_RANGE: return mptcp_setsockopt_sol_ip_set(msk, optname, optval, optlen); case IP_TOS: return mptcp_setsockopt_v4_set_tos(msk, optname, optval, optlen); } return -EOPNOTSUPP; } static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = (struct sock *)msk; struct sock *ssk; int ret; /* Limit to first subflow, before the connection establishment */ lock_sock(sk); ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) { ret = PTR_ERR(ssk); goto unlock; } ret = tcp_setsockopt(ssk, level, optname, optval, optlen); unlock: release_sock(sk); return ret; } static int mptcp_setsockopt_all_sf(struct mptcp_sock *msk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct mptcp_subflow_context *subflow; int ret = 0; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); ret = tcp_setsockopt(ssk, level, optname, optval, optlen); if (ret) break; } return ret; } static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = (void *)msk; int ret, val; switch (optname) { case TCP_ULP: return -EOPNOTSUPP; case TCP_CONGESTION: return mptcp_setsockopt_sol_tcp_congestion(msk, optval, optlen); case TCP_DEFER_ACCEPT: /* See tcp.c: TCP_DEFER_ACCEPT does not fail */ mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen); return 0; case TCP_FASTOPEN: case TCP_FASTOPEN_CONNECT: case TCP_FASTOPEN_KEY: case TCP_FASTOPEN_NO_COOKIE: return mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen); } ret = mptcp_get_int_option(msk, optval, optlen, &val); if (ret) return ret; lock_sock(sk); switch (optname) { case TCP_INQ: if (val < 0 || val > 1) ret = -EINVAL; else msk->recvmsg_inq = !!val; break; case TCP_NOTSENT_LOWAT: WRITE_ONCE(msk->notsent_lowat, val); mptcp_write_space(sk); break; case TCP_CORK: ret = __mptcp_setsockopt_sol_tcp_cork(msk, val); break; case TCP_NODELAY: ret = __mptcp_setsockopt_sol_tcp_nodelay(msk, val); break; case TCP_KEEPIDLE: ret = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPIDLE, &tcp_sock_set_keepidle_locked, &msk->keepalive_idle, val); break; case TCP_KEEPINTVL: ret = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPINTVL, &tcp_sock_set_keepintvl, &msk->keepalive_intvl, val); break; case TCP_KEEPCNT: ret = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPCNT, &tcp_sock_set_keepcnt, &msk->keepalive_cnt, val); break; case TCP_MAXSEG: msk->maxseg = val; ret = mptcp_setsockopt_all_sf(msk, SOL_TCP, optname, optval, optlen); break; default: ret = -ENOPROTOOPT; } release_sock(sk); return ret; } int mptcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct mptcp_sock *msk = mptcp_sk(sk); struct sock *ssk; pr_debug("msk=%p\n", msk); if (level == SOL_SOCKET) return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen); if (!mptcp_supported_sockopt(level, optname)) return -ENOPROTOOPT; /* @@ the meaning of setsockopt() when the socket is connected and * there are multiple subflows is not yet defined. It is up to the * MPTCP-level socket to configure the subflows until the subflow * is in TCP fallback, when TCP socket options are passed through * to the one remaining subflow. */ lock_sock(sk); ssk = __mptcp_tcp_fallback(msk); release_sock(sk); if (ssk) return tcp_setsockopt(ssk, level, optname, optval, optlen); if (level == SOL_IP) return mptcp_setsockopt_v4(msk, optname, optval, optlen); if (level == SOL_IPV6) return mptcp_setsockopt_v6(msk, optname, optval, optlen); if (level == SOL_TCP) return mptcp_setsockopt_sol_tcp(msk, optname, optval, optlen); return -EOPNOTSUPP; } static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = (struct sock *)msk; struct sock *ssk; int ret; lock_sock(sk); ssk = msk->first; if (ssk) goto get; ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) { ret = PTR_ERR(ssk); goto out; } get: ret = tcp_getsockopt(ssk, level, optname, optval, optlen); out: release_sock(sk); return ret; } void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) { struct sock *sk = (struct sock *)msk; u32 flags = 0; bool slow; u32 now; memset(info, 0, sizeof(*info)); info->mptcpi_extra_subflows = READ_ONCE(msk->pm.extra_subflows); info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled); info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used); if (inet_sk_state_load(sk) == TCP_LISTEN) return; /* The following limits only make sense for the in-kernel PM */ if (mptcp_pm_is_kernel(msk)) { info->mptcpi_limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); info->mptcpi_endp_signal_max = mptcp_pm_get_endp_signal_max(msk); info->mptcpi_limit_add_addr_accepted = mptcp_pm_get_limit_add_addr_accepted(msk); info->mptcpi_endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk); info->mptcpi_endp_laminar_max = mptcp_pm_get_endp_laminar_max(msk); } if (__mptcp_check_fallback(msk)) flags |= MPTCP_INFO_FLAG_FALLBACK; if (READ_ONCE(msk->can_ack)) flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED; info->mptcpi_flags = flags; slow = lock_sock_fast(sk); info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled); info->mptcpi_token = msk->token; info->mptcpi_write_seq = msk->write_seq; info->mptcpi_retransmits = inet_csk(sk)->icsk_retransmits; info->mptcpi_bytes_sent = msk->bytes_sent; info->mptcpi_bytes_received = msk->bytes_received; info->mptcpi_bytes_retrans = msk->bytes_retrans; info->mptcpi_subflows_total = info->mptcpi_extra_subflows + __mptcp_has_initial_subflow(msk); now = tcp_jiffies32; info->mptcpi_last_data_sent = jiffies_to_msecs(now - msk->last_data_sent); info->mptcpi_last_data_recv = jiffies_to_msecs(now - msk->last_data_recv); unlock_sock_fast(sk, slow); mptcp_data_lock(sk); info->mptcpi_last_ack_recv = jiffies_to_msecs(now - msk->last_ack_recv); info->mptcpi_snd_una = msk->snd_una; info->mptcpi_rcv_nxt = msk->ack_seq; info->mptcpi_bytes_acked = msk->bytes_acked; mptcp_data_unlock(sk); } EXPORT_SYMBOL_GPL(mptcp_diag_fill_info); static int mptcp_getsockopt_info(struct mptcp_sock *msk, char __user *optval, int __user *optlen) { struct mptcp_info m_info; int len; if (get_user(len, optlen)) return -EFAULT; /* When used only to check if a fallback to TCP happened. */ if (len == 0) return 0; len = min_t(unsigned int, len, sizeof(struct mptcp_info)); mptcp_diag_fill_info(msk, &m_info); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &m_info, len)) return -EFAULT; return 0; } static int mptcp_put_subflow_data(struct mptcp_subflow_data *sfd, char __user *optval, u32 copied, int __user *optlen) { u32 copylen = min_t(u32, sfd->size_subflow_data, sizeof(*sfd)); if (copied) copied += sfd->size_subflow_data; else copied = copylen; if (put_user(copied, optlen)) return -EFAULT; if (copy_to_user(optval, sfd, copylen)) return -EFAULT; return 0; } static int mptcp_get_subflow_data(struct mptcp_subflow_data *sfd, char __user *optval, int __user *optlen) { int len, copylen; if (get_user(len, optlen)) return -EFAULT; /* if mptcp_subflow_data size is changed, need to adjust * this function to deal with programs using old version. */ BUILD_BUG_ON(sizeof(*sfd) != MIN_INFO_OPTLEN_SIZE); if (len < MIN_INFO_OPTLEN_SIZE) return -EINVAL; memset(sfd, 0, sizeof(*sfd)); copylen = min_t(unsigned int, len, sizeof(*sfd)); if (copy_from_user(sfd, optval, copylen)) return -EFAULT; /* size_subflow_data is u32, but len is signed */ if (sfd->size_subflow_data > INT_MAX || sfd->size_user > INT_MAX) return -EINVAL; if (sfd->size_subflow_data < MIN_INFO_OPTLEN_SIZE || sfd->size_subflow_data > len) return -EINVAL; if (sfd->num_subflows || sfd->size_kernel) return -EINVAL; return len - sfd->size_subflow_data; } static int mptcp_getsockopt_tcpinfo(struct mptcp_sock *msk, char __user *optval, int __user *optlen) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; unsigned int sfcount = 0, copied = 0; struct mptcp_subflow_data sfd; char __user *infoptr; int len; len = mptcp_get_subflow_data(&sfd, optval, optlen); if (len < 0) return len; sfd.size_kernel = sizeof(struct tcp_info); sfd.size_user = min_t(unsigned int, sfd.size_user, sizeof(struct tcp_info)); infoptr = optval + sfd.size_subflow_data; lock_sock(sk); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); ++sfcount; if (len && len >= sfd.size_user) { struct tcp_info info; tcp_get_info(ssk, &info); if (copy_to_user(infoptr, &info, sfd.size_user)) { release_sock(sk); return -EFAULT; } infoptr += sfd.size_user; copied += sfd.size_user; len -= sfd.size_user; } } release_sock(sk); sfd.num_subflows = sfcount; if (mptcp_put_subflow_data(&sfd, optval, copied, optlen)) return -EFAULT; return 0; } static void mptcp_get_sub_addrs(const struct sock *sk, struct mptcp_subflow_addrs *a) { const struct inet_sock *inet = inet_sk(sk); memset(a, 0, sizeof(*a)); if (sk->sk_family == AF_INET) { a->sin_local.sin_family = AF_INET; a->sin_local.sin_port = inet->inet_sport; a->sin_local.sin_addr.s_addr = inet->inet_rcv_saddr; if (!a->sin_local.sin_addr.s_addr) a->sin_local.sin_addr.s_addr = inet->inet_saddr; a->sin_remote.sin_family = AF_INET; a->sin_remote.sin_port = inet->inet_dport; a->sin_remote.sin_addr.s_addr = inet->inet_daddr; #if IS_ENABLED(CONFIG_IPV6) } else if (sk->sk_family == AF_INET6) { const struct ipv6_pinfo *np = inet6_sk(sk); if (WARN_ON_ONCE(!np)) return; a->sin6_local.sin6_family = AF_INET6; a->sin6_local.sin6_port = inet->inet_sport; if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) a->sin6_local.sin6_addr = np->saddr; else a->sin6_local.sin6_addr = sk->sk_v6_rcv_saddr; a->sin6_remote.sin6_family = AF_INET6; a->sin6_remote.sin6_port = inet->inet_dport; a->sin6_remote.sin6_addr = sk->sk_v6_daddr; #endif } } static int mptcp_getsockopt_subflow_addrs(struct mptcp_sock *msk, char __user *optval, int __user *optlen) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; unsigned int sfcount = 0, copied = 0; struct mptcp_subflow_data sfd; char __user *addrptr; int len; len = mptcp_get_subflow_data(&sfd, optval, optlen); if (len < 0) return len; sfd.size_kernel = sizeof(struct mptcp_subflow_addrs); sfd.size_user = min_t(unsigned int, sfd.size_user, sizeof(struct mptcp_subflow_addrs)); addrptr = optval + sfd.size_subflow_data; lock_sock(sk); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); ++sfcount; if (len && len >= sfd.size_user) { struct mptcp_subflow_addrs a; mptcp_get_sub_addrs(ssk, &a); if (copy_to_user(addrptr, &a, sfd.size_user)) { release_sock(sk); return -EFAULT; } addrptr += sfd.size_user; copied += sfd.size_user; len -= sfd.size_user; } } release_sock(sk); sfd.num_subflows = sfcount; if (mptcp_put_subflow_data(&sfd, optval, copied, optlen)) return -EFAULT; return 0; } static int mptcp_get_full_info(struct mptcp_full_info *mfi, char __user *optval, int __user *optlen) { int len; BUILD_BUG_ON(offsetof(struct mptcp_full_info, mptcp_info) != MIN_FULL_INFO_OPTLEN_SIZE); if (get_user(len, optlen)) return -EFAULT; if (len < MIN_FULL_INFO_OPTLEN_SIZE) return -EINVAL; memset(mfi, 0, sizeof(*mfi)); if (copy_from_user(mfi, optval, MIN_FULL_INFO_OPTLEN_SIZE)) return -EFAULT; if (mfi->size_tcpinfo_kernel || mfi->size_sfinfo_kernel || mfi->num_subflows) return -EINVAL; if (mfi->size_sfinfo_user > INT_MAX || mfi->size_tcpinfo_user > INT_MAX) return -EINVAL; return len - MIN_FULL_INFO_OPTLEN_SIZE; } static int mptcp_put_full_info(struct mptcp_full_info *mfi, char __user *optval, u32 copylen, int __user *optlen) { copylen += MIN_FULL_INFO_OPTLEN_SIZE; if (put_user(copylen, optlen)) return -EFAULT; if (copy_to_user(optval, mfi, copylen)) return -EFAULT; return 0; } static int mptcp_getsockopt_full_info(struct mptcp_sock *msk, char __user *optval, int __user *optlen) { unsigned int sfcount = 0, copylen = 0; struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; void __user *tcpinfoptr, *sfinfoptr; struct mptcp_full_info mfi; int len; len = mptcp_get_full_info(&mfi, optval, optlen); if (len < 0) return len; /* don't bother filling the mptcp info if there is not enough * user-space-provided storage */ if (len > 0) { mptcp_diag_fill_info(msk, &mfi.mptcp_info); copylen += min_t(unsigned int, len, sizeof(struct mptcp_info)); } mfi.size_tcpinfo_kernel = sizeof(struct tcp_info); mfi.size_tcpinfo_user = min_t(unsigned int, mfi.size_tcpinfo_user, sizeof(struct tcp_info)); sfinfoptr = u64_to_user_ptr(mfi.subflow_info); mfi.size_sfinfo_kernel = sizeof(struct mptcp_subflow_info); mfi.size_sfinfo_user = min_t(unsigned int, mfi.size_sfinfo_user, sizeof(struct mptcp_subflow_info)); tcpinfoptr = u64_to_user_ptr(mfi.tcp_info); lock_sock(sk); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct mptcp_subflow_info sfinfo; struct tcp_info tcp_info; if (sfcount++ >= mfi.size_arrays_user) continue; /* fetch addr/tcp_info only if the user space buffers * are wide enough */ memset(&sfinfo, 0, sizeof(sfinfo)); sfinfo.id = subflow->subflow_id; if (mfi.size_sfinfo_user > offsetof(struct mptcp_subflow_info, addrs)) mptcp_get_sub_addrs(ssk, &sfinfo.addrs); if (copy_to_user(sfinfoptr, &sfinfo, mfi.size_sfinfo_user)) goto fail_release; if (mfi.size_tcpinfo_user) { tcp_get_info(ssk, &tcp_info); if (copy_to_user(tcpinfoptr, &tcp_info, mfi.size_tcpinfo_user)) goto fail_release; } tcpinfoptr += mfi.size_tcpinfo_user; sfinfoptr += mfi.size_sfinfo_user; } release_sock(sk); mfi.num_subflows = sfcount; if (mptcp_put_full_info(&mfi, optval, copylen, optlen)) return -EFAULT; return 0; fail_release: release_sock(sk); return -EFAULT; } static int mptcp_put_int_option(struct mptcp_sock *msk, char __user *optval, int __user *optlen, int val) { int len; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; if (len < sizeof(int) && len > 0 && val >= 0 && val <= 255) { unsigned char ucval = (unsigned char)val; len = 1; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &ucval, 1)) return -EFAULT; } else { len = min_t(unsigned int, len, sizeof(int)); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; } return 0; } static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname, char __user *optval, int __user *optlen) { struct sock *sk = (void *)msk; switch (optname) { case TCP_ULP: case TCP_CONGESTION: case TCP_INFO: case TCP_CC_INFO: case TCP_DEFER_ACCEPT: case TCP_FASTOPEN: case TCP_FASTOPEN_CONNECT: case TCP_FASTOPEN_KEY: case TCP_FASTOPEN_NO_COOKIE: return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen); case TCP_INQ: return mptcp_put_int_option(msk, optval, optlen, msk->recvmsg_inq); case TCP_CORK: return mptcp_put_int_option(msk, optval, optlen, msk->cork); case TCP_NODELAY: return mptcp_put_int_option(msk, optval, optlen, msk->nodelay); case TCP_KEEPIDLE: return mptcp_put_int_option(msk, optval, optlen, msk->keepalive_idle ? : READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_time) / HZ); case TCP_KEEPINTVL: return mptcp_put_int_option(msk, optval, optlen, msk->keepalive_intvl ? : READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_intvl) / HZ); case TCP_KEEPCNT: return mptcp_put_int_option(msk, optval, optlen, msk->keepalive_cnt ? : READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_probes)); case TCP_NOTSENT_LOWAT: return mptcp_put_int_option(msk, optval, optlen, msk->notsent_lowat); case TCP_IS_MPTCP: return mptcp_put_int_option(msk, optval, optlen, 1); case TCP_MAXSEG: return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen); } return -EOPNOTSUPP; } static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname, char __user *optval, int __user *optlen) { struct sock *sk = (void *)msk; switch (optname) { case IP_TOS: return mptcp_put_int_option(msk, optval, optlen, READ_ONCE(inet_sk(sk)->tos)); case IP_FREEBIND: return mptcp_put_int_option(msk, optval, optlen, inet_test_bit(FREEBIND, sk)); case IP_TRANSPARENT: return mptcp_put_int_option(msk, optval, optlen, inet_test_bit(TRANSPARENT, sk)); case IP_BIND_ADDRESS_NO_PORT: return mptcp_put_int_option(msk, optval, optlen, inet_test_bit(BIND_ADDRESS_NO_PORT, sk)); case IP_LOCAL_PORT_RANGE: return mptcp_put_int_option(msk, optval, optlen, READ_ONCE(inet_sk(sk)->local_port_range)); } return -EOPNOTSUPP; } static int mptcp_getsockopt_v6(struct mptcp_sock *msk, int optname, char __user *optval, int __user *optlen) { struct sock *sk = (void *)msk; switch (optname) { case IPV6_V6ONLY: return mptcp_put_int_option(msk, optval, optlen, sk->sk_ipv6only); case IPV6_TRANSPARENT: return mptcp_put_int_option(msk, optval, optlen, inet_test_bit(TRANSPARENT, sk)); case IPV6_FREEBIND: return mptcp_put_int_option(msk, optval, optlen, inet_test_bit(FREEBIND, sk)); } return -EOPNOTSUPP; } static int mptcp_getsockopt_sol_mptcp(struct mptcp_sock *msk, int optname, char __user *optval, int __user *optlen) { switch (optname) { case MPTCP_INFO: return mptcp_getsockopt_info(msk, optval, optlen); case MPTCP_FULL_INFO: return mptcp_getsockopt_full_info(msk, optval, optlen); case MPTCP_TCPINFO: return mptcp_getsockopt_tcpinfo(msk, optval, optlen); case MPTCP_SUBFLOW_ADDRS: return mptcp_getsockopt_subflow_addrs(msk, optval, optlen); } return -EOPNOTSUPP; } int mptcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *option) { struct mptcp_sock *msk = mptcp_sk(sk); struct sock *ssk; pr_debug("msk=%p\n", msk); /* @@ the meaning of setsockopt() when the socket is connected and * there are multiple subflows is not yet defined. It is up to the * MPTCP-level socket to configure the subflows until the subflow * is in TCP fallback, when socket options are passed through * to the one remaining subflow. */ lock_sock(sk); ssk = __mptcp_tcp_fallback(msk); release_sock(sk); if (ssk) return tcp_getsockopt(ssk, level, optname, optval, option); if (level == SOL_IP) return mptcp_getsockopt_v4(msk, optname, optval, option); if (level == SOL_IPV6) return mptcp_getsockopt_v6(msk, optname, optval, option); if (level == SOL_TCP) return mptcp_getsockopt_sol_tcp(msk, optname, optval, option); if (level == SOL_MPTCP) return mptcp_getsockopt_sol_mptcp(msk, optname, optval, option); return -EOPNOTSUPP; } static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) { static const unsigned int tx_rx_locks = SOCK_RCVBUF_LOCK | SOCK_SNDBUF_LOCK; struct sock *sk = (struct sock *)msk; bool keep_open; keep_open = sock_flag(sk, SOCK_KEEPOPEN); if (ssk->sk_prot->keepalive) ssk->sk_prot->keepalive(ssk, keep_open); sock_valbool_flag(ssk, SOCK_KEEPOPEN, keep_open); ssk->sk_priority = sk->sk_priority; ssk->sk_bound_dev_if = sk->sk_bound_dev_if; ssk->sk_incoming_cpu = sk->sk_incoming_cpu; ssk->sk_ipv6only = sk->sk_ipv6only; __ip_sock_set_tos(ssk, inet_sk(sk)->tos); if (sk->sk_userlocks & tx_rx_locks) { ssk->sk_userlocks |= sk->sk_userlocks & tx_rx_locks; if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) { WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf); mptcp_subflow_ctx(ssk)->cached_sndbuf = sk->sk_sndbuf; } if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf); } if (sock_flag(sk, SOCK_LINGER)) { ssk->sk_lingertime = sk->sk_lingertime; sock_set_flag(ssk, SOCK_LINGER); } else { sock_reset_flag(ssk, SOCK_LINGER); } if (sk->sk_mark != ssk->sk_mark) { ssk->sk_mark = sk->sk_mark; sk_dst_reset(ssk); } sock_valbool_flag(ssk, SOCK_DBG, sock_flag(sk, SOCK_DBG)); if (inet_csk(sk)->icsk_ca_ops != inet_csk(ssk)->icsk_ca_ops) tcp_set_congestion_control(ssk, msk->ca_name, false, true); __tcp_sock_set_cork(ssk, !!msk->cork); __tcp_sock_set_nodelay(ssk, !!msk->nodelay); tcp_sock_set_keepidle_locked(ssk, msk->keepalive_idle); tcp_sock_set_keepintvl(ssk, msk->keepalive_intvl); tcp_sock_set_keepcnt(ssk, msk->keepalive_cnt); tcp_sock_set_maxseg(ssk, msk->maxseg); inet_assign_bit(TRANSPARENT, ssk, inet_test_bit(TRANSPARENT, sk)); inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk)); inet_assign_bit(BIND_ADDRESS_NO_PORT, ssk, inet_test_bit(BIND_ADDRESS_NO_PORT, sk)); WRITE_ONCE(inet_sk(ssk)->local_port_range, READ_ONCE(inet_sk(sk)->local_port_range)); } void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); msk_owned_by_me(msk); ssk->sk_rcvlowat = 0; /* subflows must ignore any latency-related settings: will not affect * the user-space - only the msk is relevant - but will foul the * mptcp scheduler */ tcp_sk(ssk)->notsent_lowat = UINT_MAX; if (READ_ONCE(subflow->setsockopt_seq) != msk->setsockopt_seq) { sync_socket_options(msk, ssk); subflow->setsockopt_seq = msk->setsockopt_seq; } } /* unfortunately this is different enough from the tcp version so * that we can't factor it out */ int mptcp_set_rcvlowat(struct sock *sk, int val) { struct mptcp_subflow_context *subflow; int space, cap; /* bpf can land here with a wrong sk type */ if (sk->sk_protocol == IPPROTO_TCP) return -EINVAL; if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) cap = sk->sk_rcvbuf >> 1; else cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; val = min(val, cap); WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); /* Check if we need to signal EPOLLIN right now */ if (mptcp_epollin_ready(sk)) sk->sk_data_ready(sk); if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) return 0; space = mptcp_space_from_win(sk, val); if (space <= sk->sk_rcvbuf) return 0; /* propagate the rcvbuf changes to all the subflows */ WRITE_ONCE(sk->sk_rcvbuf, space); mptcp_for_each_subflow(mptcp_sk(sk), subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow; slow = lock_sock_fast(ssk); WRITE_ONCE(ssk->sk_rcvbuf, space); WRITE_ONCE(tcp_sk(ssk)->window_clamp, val); unlock_sock_fast(ssk, slow); } return 0; }
23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_QUEUE_H #define _NF_QUEUE_H #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/jhash.h> #include <linux/netfilter.h> #include <linux/skbuff.h> /* Each queued (to userspace) skbuff has one of these. */ struct nf_queue_entry { struct list_head list; struct sk_buff *skb; unsigned int id; unsigned int hook_index; /* index in hook_entries->hook[] */ #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) struct net_device *physin; struct net_device *physout; #endif struct nf_hook_state state; u16 size; /* sizeof(entry) + saved route keys */ /* extra space to store route keys */ }; #define nf_queue_entry_reroute(x) ((void *)x + sizeof(struct nf_queue_entry)) /* Packet queuing */ struct nf_queue_handler { int (*outfn)(struct nf_queue_entry *entry, unsigned int queuenum); void (*nf_hook_drop)(struct net *net); }; void nf_register_queue_handler(const struct nf_queue_handler *qh); void nf_unregister_queue_handler(void); bool nf_queue_entry_get_refs(struct nf_queue_entry *entry); void nf_queue_entry_free(struct nf_queue_entry *entry); static inline void init_hashrandom(u32 *jhash_initval) { while (*jhash_initval == 0) *jhash_initval = get_random_u32(); } static inline u32 hash_v4(const struct iphdr *iph, u32 initval) { /* packets in either direction go into same queue */ if ((__force u32)iph->saddr < (__force u32)iph->daddr) return jhash_3words((__force u32)iph->saddr, (__force u32)iph->daddr, iph->protocol, initval); return jhash_3words((__force u32)iph->daddr, (__force u32)iph->saddr, iph->protocol, initval); } static inline u32 hash_v6(const struct ipv6hdr *ip6h, u32 initval) { u32 a, b, c; if ((__force u32)ip6h->saddr.s6_addr32[3] < (__force u32)ip6h->daddr.s6_addr32[3]) { a = (__force u32) ip6h->saddr.s6_addr32[3]; b = (__force u32) ip6h->daddr.s6_addr32[3]; } else { b = (__force u32) ip6h->saddr.s6_addr32[3]; a = (__force u32) ip6h->daddr.s6_addr32[3]; } if ((__force u32)ip6h->saddr.s6_addr32[1] < (__force u32)ip6h->daddr.s6_addr32[1]) c = (__force u32) ip6h->saddr.s6_addr32[1]; else c = (__force u32) ip6h->daddr.s6_addr32[1]; return jhash_3words(a, b, c, initval); } static inline u32 hash_bridge(const struct sk_buff *skb, u32 initval) { struct ipv6hdr *ip6h, _ip6h; struct iphdr *iph, _iph; switch (eth_hdr(skb)->h_proto) { case htons(ETH_P_IP): iph = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*iph), &_iph); if (iph) return hash_v4(iph, initval); break; case htons(ETH_P_IPV6): ip6h = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*ip6h), &_ip6h); if (ip6h) return hash_v6(ip6h, initval); break; } return 0; } static inline u32 nfqueue_hash(const struct sk_buff *skb, u16 queue, u16 queues_total, u8 family, u32 initval) { switch (family) { case NFPROTO_IPV4: queue += reciprocal_scale(hash_v4(ip_hdr(skb), initval), queues_total); break; case NFPROTO_IPV6: queue += reciprocal_scale(hash_v6(ipv6_hdr(skb), initval), queues_total); break; case NFPROTO_BRIDGE: queue += reciprocal_scale(hash_bridge(skb, initval), queues_total); break; } return queue; } int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, unsigned int index, unsigned int verdict); #endif /* _NF_QUEUE_H */
6 6 7 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 // SPDX-License-Identifier: GPL-2.0 /* * fs/sysfs/symlink.c - operations for initializing and mounting sysfs * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * * Please see Documentation/filesystems/sysfs.rst for more information. */ #include <linux/fs.h> #include <linux/magic.h> #include <linux/mount.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/user_namespace.h> #include <linux/fs_context.h> #include <net/net_namespace.h> #include "sysfs.h" static struct kernfs_root *sysfs_root; struct kernfs_node *sysfs_root_kn; static int sysfs_get_tree(struct fs_context *fc) { struct kernfs_fs_context *kfc = fc->fs_private; int ret; ret = kernfs_get_tree(fc); if (ret) return ret; if (kfc->new_sb_created) fc->root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE; return 0; } static void sysfs_fs_context_free(struct fs_context *fc) { struct kernfs_fs_context *kfc = fc->fs_private; if (kfc->ns_tag) kobj_ns_drop(KOBJ_NS_TYPE_NET, kfc->ns_tag); kernfs_free_fs_context(fc); kfree(kfc); } static const struct fs_context_operations sysfs_fs_context_ops = { .free = sysfs_fs_context_free, .get_tree = sysfs_get_tree, }; static int sysfs_init_fs_context(struct fs_context *fc) { struct kernfs_fs_context *kfc; struct net *netns; if (!(fc->sb_flags & SB_KERNMOUNT)) { if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET)) return -EPERM; } kfc = kzalloc(sizeof(struct kernfs_fs_context), GFP_KERNEL); if (!kfc) return -ENOMEM; kfc->ns_tag = netns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); kfc->root = sysfs_root; kfc->magic = SYSFS_MAGIC; fc->fs_private = kfc; fc->ops = &sysfs_fs_context_ops; if (netns) { put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(netns->user_ns); } fc->global = true; return 0; } static void sysfs_kill_sb(struct super_block *sb) { void *ns = (void *)kernfs_super_ns(sb); kernfs_kill_sb(sb); kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); } static struct file_system_type sysfs_fs_type = { .name = "sysfs", .init_fs_context = sysfs_init_fs_context, .kill_sb = sysfs_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; int __init sysfs_init(void) { int err; sysfs_root = kernfs_create_root(NULL, KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, NULL); if (IS_ERR(sysfs_root)) return PTR_ERR(sysfs_root); sysfs_root_kn = kernfs_root_to_node(sysfs_root); err = register_filesystem(&sysfs_fs_type); if (err) { kernfs_destroy_root(sysfs_root); return err; } return 0; }
5 5 5 8 8 8 8 20 20 1 17 22 14 15 15 3 3 3 3 18 4 4 3 3 1 1 15 1 15 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 // SPDX-License-Identifier: GPL-2.0 /* * BlueZ - Bluetooth protocol stack for Linux * * Copyright (C) 2021 Intel Corporation * Copyright 2023 NXP */ #include <linux/property.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/mgmt.h> #include "hci_codec.h" #include "hci_debugfs.h" #include "smp.h" #include "eir.h" #include "msft.h" #include "aosp.h" #include "leds.h" static void hci_cmd_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode, struct sk_buff *skb) { bt_dev_dbg(hdev, "result 0x%2.2x", result); if (hdev->req_status != HCI_REQ_PEND) return; hdev->req_result = result; hdev->req_status = HCI_REQ_DONE; /* Free the request command so it is not used as response */ kfree_skb(hdev->req_skb); hdev->req_skb = NULL; if (skb) { struct sock *sk = hci_skb_sk(skb); /* Drop sk reference if set */ if (sk) sock_put(sk); hdev->req_rsp = skb_get(skb); } wake_up_interruptible(&hdev->req_wait_q); } struct sk_buff *hci_cmd_sync_alloc(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, struct sock *sk) { int len = HCI_COMMAND_HDR_SIZE + plen; struct hci_command_hdr *hdr; struct sk_buff *skb; skb = bt_skb_alloc(len, GFP_ATOMIC); if (!skb) return NULL; hdr = skb_put(skb, HCI_COMMAND_HDR_SIZE); hdr->opcode = cpu_to_le16(opcode); hdr->plen = plen; if (plen) skb_put_data(skb, param, plen); bt_dev_dbg(hdev, "skb len %d", skb->len); hci_skb_pkt_type(skb) = HCI_COMMAND_PKT; hci_skb_opcode(skb) = opcode; /* Grab a reference if command needs to be associated with a sock (e.g. * likely mgmt socket that initiated the command). */ if (sk) { hci_skb_sk(skb) = sk; sock_hold(sk); } return skb; } static void hci_cmd_sync_add(struct hci_request *req, u16 opcode, u32 plen, const void *param, u8 event, struct sock *sk) { struct hci_dev *hdev = req->hdev; struct sk_buff *skb; bt_dev_dbg(hdev, "opcode 0x%4.4x plen %d", opcode, plen); /* If an error occurred during request building, there is no point in * queueing the HCI command. We can simply return. */ if (req->err) return; skb = hci_cmd_sync_alloc(hdev, opcode, plen, param, sk); if (!skb) { bt_dev_err(hdev, "no memory for command (opcode 0x%4.4x)", opcode); req->err = -ENOMEM; return; } if (skb_queue_empty(&req->cmd_q)) bt_cb(skb)->hci.req_flags |= HCI_REQ_START; hci_skb_event(skb) = event; skb_queue_tail(&req->cmd_q, skb); } static int hci_req_sync_run(struct hci_request *req) { struct hci_dev *hdev = req->hdev; struct sk_buff *skb; unsigned long flags; bt_dev_dbg(hdev, "length %u", skb_queue_len(&req->cmd_q)); /* If an error occurred during request building, remove all HCI * commands queued on the HCI request queue. */ if (req->err) { skb_queue_purge(&req->cmd_q); return req->err; } /* Do not allow empty requests */ if (skb_queue_empty(&req->cmd_q)) return -ENODATA; skb = skb_peek_tail(&req->cmd_q); bt_cb(skb)->hci.req_complete_skb = hci_cmd_sync_complete; bt_cb(skb)->hci.req_flags |= HCI_REQ_SKB; spin_lock_irqsave(&hdev->cmd_q.lock, flags); skb_queue_splice_tail(&req->cmd_q, &hdev->cmd_q); spin_unlock_irqrestore(&hdev->cmd_q.lock, flags); queue_work(hdev->workqueue, &hdev->cmd_work); return 0; } static void hci_request_init(struct hci_request *req, struct hci_dev *hdev) { skb_queue_head_init(&req->cmd_q); req->hdev = hdev; req->err = 0; } /* This function requires the caller holds hdev->req_lock. */ struct sk_buff *__hci_cmd_sync_sk(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u8 event, u32 timeout, struct sock *sk) { struct hci_request req; struct sk_buff *skb; int err = 0; bt_dev_dbg(hdev, "Opcode 0x%4.4x", opcode); hci_request_init(&req, hdev); hci_cmd_sync_add(&req, opcode, plen, param, event, sk); hdev->req_status = HCI_REQ_PEND; err = hci_req_sync_run(&req); if (err < 0) return ERR_PTR(err); err = wait_event_interruptible_timeout(hdev->req_wait_q, hdev->req_status != HCI_REQ_PEND, timeout); if (err == -ERESTARTSYS) return ERR_PTR(-EINTR); switch (hdev->req_status) { case HCI_REQ_DONE: err = -bt_to_errno(hdev->req_result); break; case HCI_REQ_CANCELED: err = -hdev->req_result; break; default: err = -ETIMEDOUT; break; } hdev->req_status = 0; hdev->req_result = 0; skb = hdev->req_rsp; hdev->req_rsp = NULL; bt_dev_dbg(hdev, "end: err %d", err); if (err < 0) { kfree_skb(skb); return ERR_PTR(err); } /* If command return a status event skb will be set to NULL as there are * no parameters. */ if (!skb) return ERR_PTR(-ENODATA); return skb; } EXPORT_SYMBOL(__hci_cmd_sync_sk); /* This function requires the caller holds hdev->req_lock. */ struct sk_buff *__hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u32 timeout) { return __hci_cmd_sync_sk(hdev, opcode, plen, param, 0, timeout, NULL); } EXPORT_SYMBOL(__hci_cmd_sync); /* Send HCI command and wait for command complete event */ struct sk_buff *hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u32 timeout) { struct sk_buff *skb; if (!test_bit(HCI_UP, &hdev->flags)) return ERR_PTR(-ENETDOWN); bt_dev_dbg(hdev, "opcode 0x%4.4x plen %d", opcode, plen); hci_req_sync_lock(hdev); skb = __hci_cmd_sync(hdev, opcode, plen, param, timeout); hci_req_sync_unlock(hdev); return skb; } EXPORT_SYMBOL(hci_cmd_sync); /* This function requires the caller holds hdev->req_lock. */ struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u8 event, u32 timeout) { return __hci_cmd_sync_sk(hdev, opcode, plen, param, event, timeout, NULL); } EXPORT_SYMBOL(__hci_cmd_sync_ev); /* This function requires the caller holds hdev->req_lock. */ int __hci_cmd_sync_status_sk(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u8 event, u32 timeout, struct sock *sk) { struct sk_buff *skb; u8 status; skb = __hci_cmd_sync_sk(hdev, opcode, plen, param, event, timeout, sk); /* If command return a status event, skb will be set to -ENODATA */ if (skb == ERR_PTR(-ENODATA)) return 0; if (IS_ERR(skb)) { if (!event) bt_dev_err(hdev, "Opcode 0x%4.4x failed: %ld", opcode, PTR_ERR(skb)); return PTR_ERR(skb); } status = skb->data[0]; kfree_skb(skb); return status; } EXPORT_SYMBOL(__hci_cmd_sync_status_sk); int __hci_cmd_sync_status(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u32 timeout) { return __hci_cmd_sync_status_sk(hdev, opcode, plen, param, 0, timeout, NULL); } EXPORT_SYMBOL(__hci_cmd_sync_status); int hci_cmd_sync_status(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u32 timeout) { int err; hci_req_sync_lock(hdev); err = __hci_cmd_sync_status(hdev, opcode, plen, param, timeout); hci_req_sync_unlock(hdev); return err; } EXPORT_SYMBOL(hci_cmd_sync_status); static void hci_cmd_sync_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_sync_work); bt_dev_dbg(hdev, ""); /* Dequeue all entries and run them */ while (1) { struct hci_cmd_sync_work_entry *entry; mutex_lock(&hdev->cmd_sync_work_lock); entry = list_first_entry_or_null(&hdev->cmd_sync_work_list, struct hci_cmd_sync_work_entry, list); if (entry) list_del(&entry->list); mutex_unlock(&hdev->cmd_sync_work_lock); if (!entry) break; bt_dev_dbg(hdev, "entry %p", entry); if (entry->func) { int err; hci_req_sync_lock(hdev); err = entry->func(hdev, entry->data); if (entry->destroy) entry->destroy(hdev, entry->data, err); hci_req_sync_unlock(hdev); } kfree(entry); } } static void hci_cmd_sync_cancel_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_sync_cancel_work); cancel_delayed_work_sync(&hdev->cmd_timer); cancel_delayed_work_sync(&hdev->ncmd_timer); atomic_set(&hdev->cmd_cnt, 1); wake_up_interruptible(&hdev->req_wait_q); } static int hci_scan_disable_sync(struct hci_dev *hdev); static int scan_disable_sync(struct hci_dev *hdev, void *data) { return hci_scan_disable_sync(hdev); } static int interleaved_inquiry_sync(struct hci_dev *hdev, void *data) { return hci_inquiry_sync(hdev, DISCOV_INTERLEAVED_INQUIRY_LEN, 0); } static void le_scan_disable(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, le_scan_disable.work); int status; bt_dev_dbg(hdev, ""); hci_dev_lock(hdev); if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) goto _return; status = hci_cmd_sync_queue(hdev, scan_disable_sync, NULL, NULL); if (status) { bt_dev_err(hdev, "failed to disable LE scan: %d", status); goto _return; } /* If we were running LE only scan, change discovery state. If * we were running both LE and BR/EDR inquiry simultaneously, * and BR/EDR inquiry is already finished, stop discovery, * otherwise BR/EDR inquiry will stop discovery when finished. * If we will resolve remote device name, do not change * discovery state. */ if (hdev->discovery.type == DISCOV_TYPE_LE) goto discov_stopped; if (hdev->discovery.type != DISCOV_TYPE_INTERLEAVED) goto _return; if (hci_test_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY)) { if (!test_bit(HCI_INQUIRY, &hdev->flags) && hdev->discovery.state != DISCOVERY_RESOLVING) goto discov_stopped; goto _return; } status = hci_cmd_sync_queue(hdev, interleaved_inquiry_sync, NULL, NULL); if (status) { bt_dev_err(hdev, "inquiry failed: status %d", status); goto discov_stopped; } goto _return; discov_stopped: hci_discovery_set_state(hdev, DISCOVERY_STOPPED); _return: hci_dev_unlock(hdev); } static int hci_le_set_scan_enable_sync(struct hci_dev *hdev, u8 val, u8 filter_dup); static int reenable_adv_sync(struct hci_dev *hdev, void *data) { bt_dev_dbg(hdev, ""); if (!hci_dev_test_flag(hdev, HCI_ADVERTISING) && list_empty(&hdev->adv_instances)) return 0; if (hdev->cur_adv_instance) { return hci_schedule_adv_instance_sync(hdev, hdev->cur_adv_instance, true); } else { if (ext_adv_capable(hdev)) { hci_start_ext_adv_sync(hdev, 0x00); } else { hci_update_adv_data_sync(hdev, 0x00); hci_update_scan_rsp_data_sync(hdev, 0x00); hci_enable_advertising_sync(hdev); } } return 0; } static void reenable_adv(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, reenable_adv_work); int status; bt_dev_dbg(hdev, ""); hci_dev_lock(hdev); status = hci_cmd_sync_queue(hdev, reenable_adv_sync, NULL, NULL); if (status) bt_dev_err(hdev, "failed to reenable ADV: %d", status); hci_dev_unlock(hdev); } static void cancel_adv_timeout(struct hci_dev *hdev) { if (hdev->adv_instance_timeout) { hdev->adv_instance_timeout = 0; cancel_delayed_work(&hdev->adv_instance_expire); } } /* For a single instance: * - force == true: The instance will be removed even when its remaining * lifetime is not zero. * - force == false: the instance will be deactivated but kept stored unless * the remaining lifetime is zero. * * For instance == 0x00: * - force == true: All instances will be removed regardless of their timeout * setting. * - force == false: Only instances that have a timeout will be removed. */ int hci_clear_adv_instance_sync(struct hci_dev *hdev, struct sock *sk, u8 instance, bool force) { struct adv_info *adv_instance, *n, *next_instance = NULL; int err; u8 rem_inst; /* Cancel any timeout concerning the removed instance(s). */ if (!instance || hdev->cur_adv_instance == instance) cancel_adv_timeout(hdev); /* Get the next instance to advertise BEFORE we remove * the current one. This can be the same instance again * if there is only one instance. */ if (instance && hdev->cur_adv_instance == instance) next_instance = hci_get_next_instance(hdev, instance); if (instance == 0x00) { list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) { if (!(force || adv_instance->timeout)) continue; rem_inst = adv_instance->instance; err = hci_remove_adv_instance(hdev, rem_inst); if (!err) mgmt_advertising_removed(sk, hdev, rem_inst); } } else { adv_instance = hci_find_adv_instance(hdev, instance); if (force || (adv_instance && adv_instance->timeout && !adv_instance->remaining_time)) { /* Don't advertise a removed instance. */ if (next_instance && next_instance->instance == instance) next_instance = NULL; err = hci_remove_adv_instance(hdev, instance); if (!err) mgmt_advertising_removed(sk, hdev, instance); } } if (!hdev_is_powered(hdev) || hci_dev_test_flag(hdev, HCI_ADVERTISING)) return 0; if (next_instance && !ext_adv_capable(hdev)) return hci_schedule_adv_instance_sync(hdev, next_instance->instance, false); return 0; } static int adv_timeout_expire_sync(struct hci_dev *hdev, void *data) { u8 instance = *(u8 *)data; kfree(data); hci_clear_adv_instance_sync(hdev, NULL, instance, false); if (list_empty(&hdev->adv_instances)) return hci_disable_advertising_sync(hdev); return 0; } static void adv_timeout_expire(struct work_struct *work) { u8 *inst_ptr; struct hci_dev *hdev = container_of(work, struct hci_dev, adv_instance_expire.work); bt_dev_dbg(hdev, ""); hci_dev_lock(hdev); hdev->adv_instance_timeout = 0; if (hdev->cur_adv_instance == 0x00) goto unlock; inst_ptr = kmalloc(1, GFP_KERNEL); if (!inst_ptr) goto unlock; *inst_ptr = hdev->cur_adv_instance; hci_cmd_sync_queue(hdev, adv_timeout_expire_sync, inst_ptr, NULL); unlock: hci_dev_unlock(hdev); } static bool is_interleave_scanning(struct hci_dev *hdev) { return hdev->interleave_scan_state != INTERLEAVE_SCAN_NONE; } static int hci_passive_scan_sync(struct hci_dev *hdev); static void interleave_scan_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, interleave_scan.work); unsigned long timeout; if (hdev->interleave_scan_state == INTERLEAVE_SCAN_ALLOWLIST) { timeout = msecs_to_jiffies(hdev->advmon_allowlist_duration); } else if (hdev->interleave_scan_state == INTERLEAVE_SCAN_NO_FILTER) { timeout = msecs_to_jiffies(hdev->advmon_no_filter_duration); } else { bt_dev_err(hdev, "unexpected error"); return; } hci_passive_scan_sync(hdev); hci_dev_lock(hdev); switch (hdev->interleave_scan_state) { case INTERLEAVE_SCAN_ALLOWLIST: bt_dev_dbg(hdev, "next state: allowlist"); hdev->interleave_scan_state = INTERLEAVE_SCAN_NO_FILTER; break; case INTERLEAVE_SCAN_NO_FILTER: bt_dev_dbg(hdev, "next state: no filter"); hdev->interleave_scan_state = INTERLEAVE_SCAN_ALLOWLIST; break; case INTERLEAVE_SCAN_NONE: bt_dev_err(hdev, "unexpected error"); } hci_dev_unlock(hdev); /* Don't continue interleaving if it was canceled */ if (is_interleave_scanning(hdev)) queue_delayed_work(hdev->req_workqueue, &hdev->interleave_scan, timeout); } void hci_cmd_sync_init(struct hci_dev *hdev) { INIT_WORK(&hdev->cmd_sync_work, hci_cmd_sync_work); INIT_LIST_HEAD(&hdev->cmd_sync_work_list); mutex_init(&hdev->cmd_sync_work_lock); mutex_init(&hdev->unregister_lock); INIT_WORK(&hdev->cmd_sync_cancel_work, hci_cmd_sync_cancel_work); INIT_WORK(&hdev->reenable_adv_work, reenable_adv); INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable); INIT_DELAYED_WORK(&hdev->adv_instance_expire, adv_timeout_expire); INIT_DELAYED_WORK(&hdev->interleave_scan, interleave_scan_work); } static void _hci_cmd_sync_cancel_entry(struct hci_dev *hdev, struct hci_cmd_sync_work_entry *entry, int err) { if (entry->destroy) entry->destroy(hdev, entry->data, err); list_del(&entry->list); kfree(entry); } void hci_cmd_sync_clear(struct hci_dev *hdev) { struct hci_cmd_sync_work_entry *entry, *tmp; cancel_work_sync(&hdev->cmd_sync_work); cancel_work_sync(&hdev->reenable_adv_work); mutex_lock(&hdev->cmd_sync_work_lock); list_for_each_entry_safe(entry, tmp, &hdev->cmd_sync_work_list, list) _hci_cmd_sync_cancel_entry(hdev, entry, -ECANCELED); mutex_unlock(&hdev->cmd_sync_work_lock); } void hci_cmd_sync_cancel(struct hci_dev *hdev, int err) { bt_dev_dbg(hdev, "err 0x%2.2x", err); if (hdev->req_status == HCI_REQ_PEND) { hdev->req_result = err; hdev->req_status = HCI_REQ_CANCELED; queue_work(hdev->workqueue, &hdev->cmd_sync_cancel_work); } } EXPORT_SYMBOL(hci_cmd_sync_cancel); /* Cancel ongoing command request synchronously: * * - Set result and mark status to HCI_REQ_CANCELED * - Wakeup command sync thread */ void hci_cmd_sync_cancel_sync(struct hci_dev *hdev, int err) { bt_dev_dbg(hdev, "err 0x%2.2x", err); if (hdev->req_status == HCI_REQ_PEND) { /* req_result is __u32 so error must be positive to be properly * propagated. */ hdev->req_result = err < 0 ? -err : err; hdev->req_status = HCI_REQ_CANCELED; wake_up_interruptible(&hdev->req_wait_q); } } EXPORT_SYMBOL(hci_cmd_sync_cancel_sync); /* Submit HCI command to be run in as cmd_sync_work: * * - hdev must _not_ be unregistered */ int hci_cmd_sync_submit(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { struct hci_cmd_sync_work_entry *entry; int err = 0; mutex_lock(&hdev->unregister_lock); if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) { err = -ENODEV; goto unlock; } entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) { err = -ENOMEM; goto unlock; } entry->func = func; entry->data = data; entry->destroy = destroy; mutex_lock(&hdev->cmd_sync_work_lock); list_add_tail(&entry->list, &hdev->cmd_sync_work_list); mutex_unlock(&hdev->cmd_sync_work_lock); queue_work(hdev->req_workqueue, &hdev->cmd_sync_work); unlock: mutex_unlock(&hdev->unregister_lock); return err; } EXPORT_SYMBOL(hci_cmd_sync_submit); /* Queue HCI command: * * - hdev must be running */ int hci_cmd_sync_queue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { /* Only queue command if hdev is running which means it had been opened * and is either on init phase or is already up. */ if (!test_bit(HCI_RUNNING, &hdev->flags)) return -ENETDOWN; return hci_cmd_sync_submit(hdev, func, data, destroy); } EXPORT_SYMBOL(hci_cmd_sync_queue); static struct hci_cmd_sync_work_entry * _hci_cmd_sync_lookup_entry(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { struct hci_cmd_sync_work_entry *entry, *tmp; list_for_each_entry_safe(entry, tmp, &hdev->cmd_sync_work_list, list) { if (func && entry->func != func) continue; if (data && entry->data != data) continue; if (destroy && entry->destroy != destroy) continue; return entry; } return NULL; } /* Queue HCI command entry once: * * - Lookup if an entry already exist and only if it doesn't creates a new entry * and queue it. */ int hci_cmd_sync_queue_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { if (hci_cmd_sync_lookup_entry(hdev, func, data, destroy)) return 0; return hci_cmd_sync_queue(hdev, func, data, destroy); } EXPORT_SYMBOL(hci_cmd_sync_queue_once); /* Run HCI command: * * - hdev must be running * - if on cmd_sync_work then run immediately otherwise queue */ int hci_cmd_sync_run(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { /* Only queue command if hdev is running which means it had been opened * and is either on init phase or is already up. */ if (!test_bit(HCI_RUNNING, &hdev->flags)) return -ENETDOWN; /* If on cmd_sync_work then run immediately otherwise queue */ if (current_work() == &hdev->cmd_sync_work) return func(hdev, data); return hci_cmd_sync_submit(hdev, func, data, destroy); } EXPORT_SYMBOL(hci_cmd_sync_run); /* Run HCI command entry once: * * - Lookup if an entry already exist and only if it doesn't creates a new entry * and run it. * - if on cmd_sync_work then run immediately otherwise queue */ int hci_cmd_sync_run_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { if (hci_cmd_sync_lookup_entry(hdev, func, data, destroy)) return 0; return hci_cmd_sync_run(hdev, func, data, destroy); } EXPORT_SYMBOL(hci_cmd_sync_run_once); /* Lookup HCI command entry: * * - Return first entry that matches by function callback or data or * destroy callback. */ struct hci_cmd_sync_work_entry * hci_cmd_sync_lookup_entry(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { struct hci_cmd_sync_work_entry *entry; mutex_lock(&hdev->cmd_sync_work_lock); entry = _hci_cmd_sync_lookup_entry(hdev, func, data, destroy); mutex_unlock(&hdev->cmd_sync_work_lock); return entry; } EXPORT_SYMBOL(hci_cmd_sync_lookup_entry); /* Cancel HCI command entry */ void hci_cmd_sync_cancel_entry(struct hci_dev *hdev, struct hci_cmd_sync_work_entry *entry) { mutex_lock(&hdev->cmd_sync_work_lock); _hci_cmd_sync_cancel_entry(hdev, entry, -ECANCELED); mutex_unlock(&hdev->cmd_sync_work_lock); } EXPORT_SYMBOL(hci_cmd_sync_cancel_entry); /* Dequeue one HCI command entry: * * - Lookup and cancel first entry that matches. */ bool hci_cmd_sync_dequeue_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { struct hci_cmd_sync_work_entry *entry; entry = hci_cmd_sync_lookup_entry(hdev, func, data, destroy); if (!entry) return false; hci_cmd_sync_cancel_entry(hdev, entry); return true; } EXPORT_SYMBOL(hci_cmd_sync_dequeue_once); /* Dequeue HCI command entry: * * - Lookup and cancel any entry that matches by function callback or data or * destroy callback. */ bool hci_cmd_sync_dequeue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { struct hci_cmd_sync_work_entry *entry; bool ret = false; mutex_lock(&hdev->cmd_sync_work_lock); while ((entry = _hci_cmd_sync_lookup_entry(hdev, func, data, destroy))) { _hci_cmd_sync_cancel_entry(hdev, entry, -ECANCELED); ret = true; } mutex_unlock(&hdev->cmd_sync_work_lock); return ret; } EXPORT_SYMBOL(hci_cmd_sync_dequeue); int hci_update_eir_sync(struct hci_dev *hdev) { struct hci_cp_write_eir cp; bt_dev_dbg(hdev, ""); if (!hdev_is_powered(hdev)) return 0; if (!lmp_ext_inq_capable(hdev)) return 0; if (!hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) return 0; if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE)) return 0; memset(&cp, 0, sizeof(cp)); eir_create(hdev, cp.data); if (memcmp(cp.data, hdev->eir, sizeof(cp.data)) == 0) return 0; memcpy(hdev->eir, cp.data, sizeof(cp.data)); return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_EIR, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static u8 get_service_classes(struct hci_dev *hdev) { struct bt_uuid *uuid; u8 val = 0; list_for_each_entry(uuid, &hdev->uuids, list) val |= uuid->svc_hint; return val; } int hci_update_class_sync(struct hci_dev *hdev) { u8 cod[3]; bt_dev_dbg(hdev, ""); if (!hdev_is_powered(hdev)) return 0; if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) return 0; if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE)) return 0; cod[0] = hdev->minor_class; cod[1] = hdev->major_class; cod[2] = get_service_classes(hdev); if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) cod[1] |= 0x20; if (memcmp(cod, hdev->dev_class, 3) == 0) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_CLASS_OF_DEV, sizeof(cod), cod, HCI_CMD_TIMEOUT); } static bool is_advertising_allowed(struct hci_dev *hdev, bool connectable) { /* If there is no connection we are OK to advertise. */ if (hci_conn_num(hdev, LE_LINK) == 0) return true; /* Check le_states if there is any connection in peripheral role. */ if (hdev->conn_hash.le_num_peripheral > 0) { /* Peripheral connection state and non connectable mode * bit 20. */ if (!connectable && !(hdev->le_states[2] & 0x10)) return false; /* Peripheral connection state and connectable mode bit 38 * and scannable bit 21. */ if (connectable && (!(hdev->le_states[4] & 0x40) || !(hdev->le_states[2] & 0x20))) return false; } /* Check le_states if there is any connection in central role. */ if (hci_conn_num(hdev, LE_LINK) != hdev->conn_hash.le_num_peripheral) { /* Central connection state and non connectable mode bit 18. */ if (!connectable && !(hdev->le_states[2] & 0x02)) return false; /* Central connection state and connectable mode bit 35 and * scannable 19. */ if (connectable && (!(hdev->le_states[4] & 0x08) || !(hdev->le_states[2] & 0x08))) return false; } return true; } static bool adv_use_rpa(struct hci_dev *hdev, uint32_t flags) { /* If privacy is not enabled don't use RPA */ if (!hci_dev_test_flag(hdev, HCI_PRIVACY)) return false; /* If basic privacy mode is enabled use RPA */ if (!hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY)) return true; /* If limited privacy mode is enabled don't use RPA if we're * both discoverable and bondable. */ if ((flags & MGMT_ADV_FLAG_DISCOV) && hci_dev_test_flag(hdev, HCI_BONDABLE)) return false; /* We're neither bondable nor discoverable in the limited * privacy mode, therefore use RPA. */ return true; } static int hci_set_random_addr_sync(struct hci_dev *hdev, bdaddr_t *rpa) { /* If a random_addr has been set we're advertising or initiating an LE * connection we can't go ahead and change the random address at this * time. This is because the eventual initiator address used for the * subsequently created connection will be undefined (some * controllers use the new address and others the one we had * when the operation started). * * In this kind of scenario skip the update and let the random * address be updated at the next cycle. */ if (bacmp(&hdev->random_addr, BDADDR_ANY) && (hci_dev_test_flag(hdev, HCI_LE_ADV) || hci_lookup_le_connect(hdev))) { bt_dev_dbg(hdev, "Deferring random address update"); hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); return 0; } return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_RANDOM_ADDR, 6, rpa, HCI_CMD_TIMEOUT); } int hci_update_random_address_sync(struct hci_dev *hdev, bool require_privacy, bool rpa, u8 *own_addr_type) { int err; /* If privacy is enabled use a resolvable private address. If * current RPA has expired or there is something else than * the current RPA in use, then generate a new one. */ if (rpa) { /* If Controller supports LL Privacy use own address type is * 0x03 */ if (ll_privacy_capable(hdev)) *own_addr_type = ADDR_LE_DEV_RANDOM_RESOLVED; else *own_addr_type = ADDR_LE_DEV_RANDOM; /* Check if RPA is valid */ if (rpa_valid(hdev)) return 0; err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa); if (err < 0) { bt_dev_err(hdev, "failed to generate new RPA"); return err; } err = hci_set_random_addr_sync(hdev, &hdev->rpa); if (err) return err; return 0; } /* In case of required privacy without resolvable private address, * use an non-resolvable private address. This is useful for active * scanning and non-connectable advertising. */ if (require_privacy) { bdaddr_t nrpa; while (true) { /* The non-resolvable private address is generated * from random six bytes with the two most significant * bits cleared. */ get_random_bytes(&nrpa, 6); nrpa.b[5] &= 0x3f; /* The non-resolvable private address shall not be * equal to the public address. */ if (bacmp(&hdev->bdaddr, &nrpa)) break; } *own_addr_type = ADDR_LE_DEV_RANDOM; return hci_set_random_addr_sync(hdev, &nrpa); } /* If forcing static address is in use or there is no public * address use the static address as random address (but skip * the HCI command if the current random address is already the * static one. * * In case BR/EDR has been disabled on a dual-mode controller * and a static address has been configured, then use that * address instead of the public BR/EDR address. */ if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) || !bacmp(&hdev->bdaddr, BDADDR_ANY) || (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) && bacmp(&hdev->static_addr, BDADDR_ANY))) { *own_addr_type = ADDR_LE_DEV_RANDOM; if (bacmp(&hdev->static_addr, &hdev->random_addr)) return hci_set_random_addr_sync(hdev, &hdev->static_addr); return 0; } /* Neither privacy nor static address is being used so use a * public address. */ *own_addr_type = ADDR_LE_DEV_PUBLIC; return 0; } static int hci_disable_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_ext_adv_enable *cp; struct hci_cp_ext_adv_set *set; u8 data[sizeof(*cp) + sizeof(*set) * 1]; u8 size; struct adv_info *adv = NULL; /* If request specifies an instance that doesn't exist, fail */ if (instance > 0) { adv = hci_find_adv_instance(hdev, instance); if (!adv) return -EINVAL; /* If not enabled there is nothing to do */ if (!adv->enabled) return 0; } memset(data, 0, sizeof(data)); cp = (void *)data; set = (void *)cp->data; /* Instance 0x00 indicates all advertising instances will be disabled */ cp->num_of_sets = !!instance; cp->enable = 0x00; set->handle = adv ? adv->handle : instance; size = sizeof(*cp) + sizeof(*set) * cp->num_of_sets; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_ENABLE, size, data, HCI_CMD_TIMEOUT); } static int hci_set_adv_set_random_addr_sync(struct hci_dev *hdev, u8 instance, bdaddr_t *random_addr) { struct hci_cp_le_set_adv_set_rand_addr cp; int err; if (!instance) { /* Instance 0x00 doesn't have an adv_info, instead it uses * hdev->random_addr to track its address so whenever it needs * to be updated this also set the random address since * hdev->random_addr is shared with scan state machine. */ err = hci_set_random_addr_sync(hdev, random_addr); if (err) return err; } memset(&cp, 0, sizeof(cp)); cp.handle = instance; bacpy(&cp.bdaddr, random_addr); return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_SET_RAND_ADDR, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_set_ext_adv_params_sync(struct hci_dev *hdev, struct adv_info *adv, const struct hci_cp_le_set_ext_adv_params *cp, struct hci_rp_le_set_ext_adv_params *rp) { struct sk_buff *skb; skb = __hci_cmd_sync(hdev, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(*cp), cp, HCI_CMD_TIMEOUT); /* If command return a status event, skb will be set to -ENODATA */ if (skb == ERR_PTR(-ENODATA)) return 0; if (IS_ERR(skb)) { bt_dev_err(hdev, "Opcode 0x%4.4x failed: %ld", HCI_OP_LE_SET_EXT_ADV_PARAMS, PTR_ERR(skb)); return PTR_ERR(skb); } if (skb->len != sizeof(*rp)) { bt_dev_err(hdev, "Invalid response length for 0x%4.4x: %u", HCI_OP_LE_SET_EXT_ADV_PARAMS, skb->len); kfree_skb(skb); return -EIO; } memcpy(rp, skb->data, sizeof(*rp)); kfree_skb(skb); if (!rp->status) { hdev->adv_addr_type = cp->own_addr_type; if (!cp->handle) { /* Store in hdev for instance 0 */ hdev->adv_tx_power = rp->tx_power; } else if (adv) { adv->tx_power = rp->tx_power; } } return rp->status; } static int hci_set_ext_adv_data_sync(struct hci_dev *hdev, u8 instance) { DEFINE_FLEX(struct hci_cp_le_set_ext_adv_data, pdu, data, length, HCI_MAX_EXT_AD_LENGTH); u8 len; struct adv_info *adv = NULL; int err; if (instance) { adv = hci_find_adv_instance(hdev, instance); if (!adv || !adv->adv_data_changed) return 0; } len = eir_create_adv_data(hdev, instance, pdu->data, HCI_MAX_EXT_AD_LENGTH); pdu->length = len; pdu->handle = adv ? adv->handle : instance; pdu->operation = LE_SET_ADV_DATA_OP_COMPLETE; pdu->frag_pref = LE_SET_ADV_DATA_NO_FRAG; err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_DATA, struct_size(pdu, data, len), pdu, HCI_CMD_TIMEOUT); if (err) return err; /* Update data if the command succeed */ if (adv) { adv->adv_data_changed = false; } else { memcpy(hdev->adv_data, pdu->data, len); hdev->adv_data_len = len; } return 0; } static int hci_set_adv_data_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_adv_data cp; u8 len; memset(&cp, 0, sizeof(cp)); len = eir_create_adv_data(hdev, instance, cp.data, sizeof(cp.data)); /* There's nothing to do if the data hasn't changed */ if (hdev->adv_data_len == len && memcmp(cp.data, hdev->adv_data, len) == 0) return 0; memcpy(hdev->adv_data, cp.data, sizeof(cp.data)); hdev->adv_data_len = len; cp.length = len; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_DATA, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_update_adv_data_sync(struct hci_dev *hdev, u8 instance) { if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) return 0; if (ext_adv_capable(hdev)) return hci_set_ext_adv_data_sync(hdev, instance); return hci_set_adv_data_sync(hdev, instance); } int hci_setup_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_ext_adv_params cp; struct hci_rp_le_set_ext_adv_params rp; bool connectable, require_privacy; u32 flags; bdaddr_t random_addr; u8 own_addr_type; int err; struct adv_info *adv; bool secondary_adv; if (instance > 0) { adv = hci_find_adv_instance(hdev, instance); if (!adv) return -EINVAL; } else { adv = NULL; } /* Updating parameters of an active instance will return a * Command Disallowed error, so we must first disable the * instance if it is active. */ if (adv) { err = hci_disable_ext_adv_instance_sync(hdev, instance); if (err) return err; } flags = hci_adv_instance_flags(hdev, instance); /* If the "connectable" instance flag was not set, then choose between * ADV_IND and ADV_NONCONN_IND based on the global connectable setting. */ connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) || mgmt_get_connectable(hdev); if (!is_advertising_allowed(hdev, connectable)) return -EPERM; /* Set require_privacy to true only when non-connectable * advertising is used and it is not periodic. * In that case it is fine to use a non-resolvable private address. */ require_privacy = !connectable && !(adv && adv->periodic); err = hci_get_random_address(hdev, require_privacy, adv_use_rpa(hdev, flags), adv, &own_addr_type, &random_addr); if (err < 0) return err; memset(&cp, 0, sizeof(cp)); if (adv) { hci_cpu_to_le24(adv->min_interval, cp.min_interval); hci_cpu_to_le24(adv->max_interval, cp.max_interval); cp.tx_power = adv->tx_power; cp.sid = adv->sid; } else { hci_cpu_to_le24(hdev->le_adv_min_interval, cp.min_interval); hci_cpu_to_le24(hdev->le_adv_max_interval, cp.max_interval); cp.tx_power = HCI_ADV_TX_POWER_NO_PREFERENCE; cp.sid = 0x00; } secondary_adv = (flags & MGMT_ADV_FLAG_SEC_MASK); if (connectable) { if (secondary_adv) cp.evt_properties = cpu_to_le16(LE_EXT_ADV_CONN_IND); else cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_IND); } else if (hci_adv_instance_is_scannable(hdev, instance) || (flags & MGMT_ADV_PARAM_SCAN_RSP)) { if (secondary_adv) cp.evt_properties = cpu_to_le16(LE_EXT_ADV_SCAN_IND); else cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_SCAN_IND); } else { if (secondary_adv) cp.evt_properties = cpu_to_le16(LE_EXT_ADV_NON_CONN_IND); else cp.evt_properties = cpu_to_le16(LE_LEGACY_NONCONN_IND); } /* If Own_Address_Type equals 0x02 or 0x03, the Peer_Address parameter * contains the peer’s Identity Address and the Peer_Address_Type * parameter contains the peer’s Identity Type (i.e., 0x00 or 0x01). * These parameters are used to locate the corresponding local IRK in * the resolving list; this IRK is used to generate their own address * used in the advertisement. */ if (own_addr_type == ADDR_LE_DEV_RANDOM_RESOLVED) hci_copy_identity_address(hdev, &cp.peer_addr, &cp.peer_addr_type); cp.own_addr_type = own_addr_type; cp.channel_map = hdev->le_adv_channel_map; cp.handle = adv ? adv->handle : instance; if (flags & MGMT_ADV_FLAG_SEC_2M) { cp.primary_phy = HCI_ADV_PHY_1M; cp.secondary_phy = HCI_ADV_PHY_2M; } else if (flags & MGMT_ADV_FLAG_SEC_CODED) { cp.primary_phy = HCI_ADV_PHY_CODED; cp.secondary_phy = HCI_ADV_PHY_CODED; } else { /* In all other cases use 1M */ cp.primary_phy = HCI_ADV_PHY_1M; cp.secondary_phy = HCI_ADV_PHY_1M; } err = hci_set_ext_adv_params_sync(hdev, adv, &cp, &rp); if (err) return err; /* Update adv data as tx power is known now */ err = hci_set_ext_adv_data_sync(hdev, cp.handle); if (err) return err; if ((own_addr_type == ADDR_LE_DEV_RANDOM || own_addr_type == ADDR_LE_DEV_RANDOM_RESOLVED) && bacmp(&random_addr, BDADDR_ANY)) { /* Check if random address need to be updated */ if (adv) { if (!bacmp(&random_addr, &adv->random_addr)) return 0; } else { if (!bacmp(&random_addr, &hdev->random_addr)) return 0; } return hci_set_adv_set_random_addr_sync(hdev, instance, &random_addr); } return 0; } static int hci_set_ext_scan_rsp_data_sync(struct hci_dev *hdev, u8 instance) { DEFINE_FLEX(struct hci_cp_le_set_ext_scan_rsp_data, pdu, data, length, HCI_MAX_EXT_AD_LENGTH); u8 len; struct adv_info *adv = NULL; int err; if (instance) { adv = hci_find_adv_instance(hdev, instance); if (!adv || !adv->scan_rsp_changed) return 0; } len = eir_create_scan_rsp(hdev, instance, pdu->data); pdu->handle = adv ? adv->handle : instance; pdu->length = len; pdu->operation = LE_SET_ADV_DATA_OP_COMPLETE; pdu->frag_pref = LE_SET_ADV_DATA_NO_FRAG; err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_SCAN_RSP_DATA, struct_size(pdu, data, len), pdu, HCI_CMD_TIMEOUT); if (err) return err; if (adv) { adv->scan_rsp_changed = false; } else { memcpy(hdev->scan_rsp_data, pdu->data, len); hdev->scan_rsp_data_len = len; } return 0; } static int __hci_set_scan_rsp_data_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_scan_rsp_data cp; u8 len; memset(&cp, 0, sizeof(cp)); len = eir_create_scan_rsp(hdev, instance, cp.data); if (hdev->scan_rsp_data_len == len && !memcmp(cp.data, hdev->scan_rsp_data, len)) return 0; memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data)); hdev->scan_rsp_data_len = len; cp.length = len; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_SCAN_RSP_DATA, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_update_scan_rsp_data_sync(struct hci_dev *hdev, u8 instance) { if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) return 0; if (ext_adv_capable(hdev)) return hci_set_ext_scan_rsp_data_sync(hdev, instance); return __hci_set_scan_rsp_data_sync(hdev, instance); } int hci_enable_ext_advertising_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_ext_adv_enable *cp; struct hci_cp_ext_adv_set *set; u8 data[sizeof(*cp) + sizeof(*set) * 1]; struct adv_info *adv; if (instance > 0) { adv = hci_find_adv_instance(hdev, instance); if (!adv) return -EINVAL; /* If already enabled there is nothing to do */ if (adv->enabled) return 0; } else { adv = NULL; } cp = (void *)data; set = (void *)cp->data; memset(cp, 0, sizeof(*cp)); cp->enable = 0x01; cp->num_of_sets = 0x01; memset(set, 0, sizeof(*set)); set->handle = adv ? adv->handle : instance; /* Set duration per instance since controller is responsible for * scheduling it. */ if (adv && adv->timeout) { u16 duration = adv->timeout * MSEC_PER_SEC; /* Time = N * 10 ms */ set->duration = cpu_to_le16(duration / 10); } return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_ENABLE, sizeof(*cp) + sizeof(*set) * cp->num_of_sets, data, HCI_CMD_TIMEOUT); } int hci_start_ext_adv_sync(struct hci_dev *hdev, u8 instance) { int err; err = hci_setup_ext_adv_instance_sync(hdev, instance); if (err) return err; err = hci_set_ext_scan_rsp_data_sync(hdev, instance); if (err) return err; return hci_enable_ext_advertising_sync(hdev, instance); } int hci_disable_per_advertising_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_per_adv_enable cp; struct adv_info *adv = NULL; /* If periodic advertising already disabled there is nothing to do. */ adv = hci_find_adv_instance(hdev, instance); if (!adv || !adv->periodic || !adv->enabled) return 0; memset(&cp, 0, sizeof(cp)); cp.enable = 0x00; cp.handle = instance; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PER_ADV_ENABLE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_set_per_adv_params_sync(struct hci_dev *hdev, u8 instance, u16 min_interval, u16 max_interval) { struct hci_cp_le_set_per_adv_params cp; memset(&cp, 0, sizeof(cp)); if (!min_interval) min_interval = DISCOV_LE_PER_ADV_INT_MIN; if (!max_interval) max_interval = DISCOV_LE_PER_ADV_INT_MAX; cp.handle = instance; cp.min_interval = cpu_to_le16(min_interval); cp.max_interval = cpu_to_le16(max_interval); cp.periodic_properties = 0x0000; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PER_ADV_PARAMS, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_set_per_adv_data_sync(struct hci_dev *hdev, u8 instance) { DEFINE_FLEX(struct hci_cp_le_set_per_adv_data, pdu, data, length, HCI_MAX_PER_AD_LENGTH); u8 len; struct adv_info *adv = NULL; if (instance) { adv = hci_find_adv_instance(hdev, instance); if (!adv || !adv->periodic) return 0; } len = eir_create_per_adv_data(hdev, instance, pdu->data); pdu->length = len; pdu->handle = adv ? adv->handle : instance; pdu->operation = LE_SET_ADV_DATA_OP_COMPLETE; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PER_ADV_DATA, struct_size(pdu, data, len), pdu, HCI_CMD_TIMEOUT); } static int hci_enable_per_advertising_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_per_adv_enable cp; struct adv_info *adv = NULL; /* If periodic advertising already enabled there is nothing to do. */ adv = hci_find_adv_instance(hdev, instance); if (adv && adv->periodic && adv->enabled) return 0; memset(&cp, 0, sizeof(cp)); cp.enable = 0x01; cp.handle = instance; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PER_ADV_ENABLE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } /* Checks if periodic advertising data contains a Basic Announcement and if it * does generates a Broadcast ID and add Broadcast Announcement. */ static int hci_adv_bcast_annoucement(struct hci_dev *hdev, struct adv_info *adv) { u8 bid[3]; u8 ad[HCI_MAX_EXT_AD_LENGTH]; u8 len; /* Skip if NULL adv as instance 0x00 is used for general purpose * advertising so it cannot used for the likes of Broadcast Announcement * as it can be overwritten at any point. */ if (!adv) return 0; /* Check if PA data doesn't contains a Basic Audio Announcement then * there is nothing to do. */ if (!eir_get_service_data(adv->per_adv_data, adv->per_adv_data_len, 0x1851, NULL)) return 0; /* Check if advertising data already has a Broadcast Announcement since * the process may want to control the Broadcast ID directly and in that * case the kernel shall no interfere. */ if (eir_get_service_data(adv->adv_data, adv->adv_data_len, 0x1852, NULL)) return 0; /* Generate Broadcast ID */ get_random_bytes(bid, sizeof(bid)); len = eir_append_service_data(ad, 0, 0x1852, bid, sizeof(bid)); memcpy(ad + len, adv->adv_data, adv->adv_data_len); hci_set_adv_instance_data(hdev, adv->instance, len + adv->adv_data_len, ad, 0, NULL); return hci_update_adv_data_sync(hdev, adv->instance); } int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 sid, u8 data_len, u8 *data, u32 flags, u16 min_interval, u16 max_interval, u16 sync_interval) { struct adv_info *adv = NULL; int err; bool added = false; hci_disable_per_advertising_sync(hdev, instance); if (instance) { adv = hci_find_adv_instance(hdev, instance); if (adv) { if (sid != HCI_SID_INVALID && adv->sid != sid) { /* If the SID don't match attempt to find by * SID. */ adv = hci_find_adv_sid(hdev, sid); if (!adv) { bt_dev_err(hdev, "Unable to find adv_info"); return -EINVAL; } } /* Turn it into periodic advertising */ adv->periodic = true; adv->per_adv_data_len = data_len; if (data) memcpy(adv->per_adv_data, data, data_len); adv->flags = flags; } else if (!adv) { /* Create an instance if that could not be found */ adv = hci_add_per_instance(hdev, instance, sid, flags, data_len, data, sync_interval, sync_interval); if (IS_ERR(adv)) return PTR_ERR(adv); adv->pending = false; added = true; } } /* Start advertising */ err = hci_start_ext_adv_sync(hdev, instance); if (err < 0) goto fail; err = hci_adv_bcast_annoucement(hdev, adv); if (err < 0) goto fail; err = hci_set_per_adv_params_sync(hdev, instance, min_interval, max_interval); if (err < 0) goto fail; err = hci_set_per_adv_data_sync(hdev, instance); if (err < 0) goto fail; err = hci_enable_per_advertising_sync(hdev, instance); if (err < 0) goto fail; return 0; fail: if (added) hci_remove_adv_instance(hdev, instance); return err; } static int hci_start_adv_sync(struct hci_dev *hdev, u8 instance) { int err; if (ext_adv_capable(hdev)) return hci_start_ext_adv_sync(hdev, instance); err = hci_update_adv_data_sync(hdev, instance); if (err) return err; err = hci_update_scan_rsp_data_sync(hdev, instance); if (err) return err; return hci_enable_advertising_sync(hdev); } int hci_enable_advertising_sync(struct hci_dev *hdev) { struct adv_info *adv_instance; struct hci_cp_le_set_adv_param cp; u8 own_addr_type, enable = 0x01; bool connectable; u16 adv_min_interval, adv_max_interval; u32 flags; u8 status; if (ext_adv_capable(hdev)) return hci_enable_ext_advertising_sync(hdev, hdev->cur_adv_instance); flags = hci_adv_instance_flags(hdev, hdev->cur_adv_instance); adv_instance = hci_find_adv_instance(hdev, hdev->cur_adv_instance); /* If the "connectable" instance flag was not set, then choose between * ADV_IND and ADV_NONCONN_IND based on the global connectable setting. */ connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) || mgmt_get_connectable(hdev); if (!is_advertising_allowed(hdev, connectable)) return -EINVAL; status = hci_disable_advertising_sync(hdev); if (status) return status; /* Clear the HCI_LE_ADV bit temporarily so that the * hci_update_random_address knows that it's safe to go ahead * and write a new random address. The flag will be set back on * as soon as the SET_ADV_ENABLE HCI command completes. */ hci_dev_clear_flag(hdev, HCI_LE_ADV); /* Set require_privacy to true only when non-connectable * advertising is used. In that case it is fine to use a * non-resolvable private address. */ status = hci_update_random_address_sync(hdev, !connectable, adv_use_rpa(hdev, flags), &own_addr_type); if (status) return status; memset(&cp, 0, sizeof(cp)); if (adv_instance) { adv_min_interval = adv_instance->min_interval; adv_max_interval = adv_instance->max_interval; } else { adv_min_interval = hdev->le_adv_min_interval; adv_max_interval = hdev->le_adv_max_interval; } if (connectable) { cp.type = LE_ADV_IND; } else { if (hci_adv_instance_is_scannable(hdev, hdev->cur_adv_instance)) cp.type = LE_ADV_SCAN_IND; else cp.type = LE_ADV_NONCONN_IND; if (!hci_dev_test_flag(hdev, HCI_DISCOVERABLE) || hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) { adv_min_interval = DISCOV_LE_FAST_ADV_INT_MIN; adv_max_interval = DISCOV_LE_FAST_ADV_INT_MAX; } } cp.min_interval = cpu_to_le16(adv_min_interval); cp.max_interval = cpu_to_le16(adv_max_interval); cp.own_address_type = own_addr_type; cp.channel_map = hdev->le_adv_channel_map; status = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (status) return status; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable, HCI_CMD_TIMEOUT); } static int enable_advertising_sync(struct hci_dev *hdev, void *data) { return hci_enable_advertising_sync(hdev); } int hci_enable_advertising(struct hci_dev *hdev) { if (!hci_dev_test_flag(hdev, HCI_ADVERTISING) && list_empty(&hdev->adv_instances)) return 0; return hci_cmd_sync_queue(hdev, enable_advertising_sync, NULL, NULL); } int hci_remove_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance, struct sock *sk) { int err; if (!ext_adv_capable(hdev)) return 0; err = hci_disable_ext_adv_instance_sync(hdev, instance); if (err) return err; /* If request specifies an instance that doesn't exist, fail */ if (instance > 0 && !hci_find_adv_instance(hdev, instance)) return -EINVAL; return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_REMOVE_ADV_SET, sizeof(instance), &instance, 0, HCI_CMD_TIMEOUT, sk); } int hci_le_terminate_big_sync(struct hci_dev *hdev, u8 handle, u8 reason) { struct hci_cp_le_term_big cp; memset(&cp, 0, sizeof(cp)); cp.handle = handle; cp.reason = reason; return __hci_cmd_sync_status(hdev, HCI_OP_LE_TERM_BIG, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_schedule_adv_instance_sync(struct hci_dev *hdev, u8 instance, bool force) { struct adv_info *adv = NULL; u16 timeout; if (hci_dev_test_flag(hdev, HCI_ADVERTISING) && !ext_adv_capable(hdev)) return -EPERM; if (hdev->adv_instance_timeout) return -EBUSY; adv = hci_find_adv_instance(hdev, instance); if (!adv) return -ENOENT; /* A zero timeout means unlimited advertising. As long as there is * only one instance, duration should be ignored. We still set a timeout * in case further instances are being added later on. * * If the remaining lifetime of the instance is more than the duration * then the timeout corresponds to the duration, otherwise it will be * reduced to the remaining instance lifetime. */ if (adv->timeout == 0 || adv->duration <= adv->remaining_time) timeout = adv->duration; else timeout = adv->remaining_time; /* The remaining time is being reduced unless the instance is being * advertised without time limit. */ if (adv->timeout) adv->remaining_time = adv->remaining_time - timeout; /* Only use work for scheduling instances with legacy advertising */ if (!ext_adv_capable(hdev)) { hdev->adv_instance_timeout = timeout; queue_delayed_work(hdev->req_workqueue, &hdev->adv_instance_expire, secs_to_jiffies(timeout)); } /* If we're just re-scheduling the same instance again then do not * execute any HCI commands. This happens when a single instance is * being advertised. */ if (!force && hdev->cur_adv_instance == instance && hci_dev_test_flag(hdev, HCI_LE_ADV)) return 0; hdev->cur_adv_instance = instance; return hci_start_adv_sync(hdev, instance); } static int hci_clear_adv_sets_sync(struct hci_dev *hdev, struct sock *sk) { int err; if (!ext_adv_capable(hdev)) return 0; /* Disable instance 0x00 to disable all instances */ err = hci_disable_ext_adv_instance_sync(hdev, 0x00); if (err) return err; return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CLEAR_ADV_SETS, 0, NULL, 0, HCI_CMD_TIMEOUT, sk); } static int hci_clear_adv_sync(struct hci_dev *hdev, struct sock *sk, bool force) { struct adv_info *adv, *n; if (ext_adv_capable(hdev)) /* Remove all existing sets */ return hci_clear_adv_sets_sync(hdev, sk); /* This is safe as long as there is no command send while the lock is * held. */ hci_dev_lock(hdev); /* Cleanup non-ext instances */ list_for_each_entry_safe(adv, n, &hdev->adv_instances, list) { u8 instance = adv->instance; int err; if (!(force || adv->timeout)) continue; err = hci_remove_adv_instance(hdev, instance); if (!err) mgmt_advertising_removed(sk, hdev, instance); } hci_dev_unlock(hdev); return 0; } static int hci_remove_adv_sync(struct hci_dev *hdev, u8 instance, struct sock *sk) { int err; /* If we use extended advertising, instance has to be removed first. */ if (ext_adv_capable(hdev)) return hci_remove_ext_adv_instance_sync(hdev, instance, sk); /* This is safe as long as there is no command send while the lock is * held. */ hci_dev_lock(hdev); err = hci_remove_adv_instance(hdev, instance); if (!err) mgmt_advertising_removed(sk, hdev, instance); hci_dev_unlock(hdev); return err; } /* For a single instance: * - force == true: The instance will be removed even when its remaining * lifetime is not zero. * - force == false: the instance will be deactivated but kept stored unless * the remaining lifetime is zero. * * For instance == 0x00: * - force == true: All instances will be removed regardless of their timeout * setting. * - force == false: Only instances that have a timeout will be removed. */ int hci_remove_advertising_sync(struct hci_dev *hdev, struct sock *sk, u8 instance, bool force) { struct adv_info *next = NULL; int err; /* Cancel any timeout concerning the removed instance(s). */ if (!instance || hdev->cur_adv_instance == instance) cancel_adv_timeout(hdev); /* Get the next instance to advertise BEFORE we remove * the current one. This can be the same instance again * if there is only one instance. */ if (hdev->cur_adv_instance == instance) next = hci_get_next_instance(hdev, instance); if (!instance) { err = hci_clear_adv_sync(hdev, sk, force); if (err) return err; } else { struct adv_info *adv = hci_find_adv_instance(hdev, instance); if (force || (adv && adv->timeout && !adv->remaining_time)) { /* Don't advertise a removed instance. */ if (next && next->instance == instance) next = NULL; err = hci_remove_adv_sync(hdev, instance, sk); if (err) return err; } } if (!hdev_is_powered(hdev) || hci_dev_test_flag(hdev, HCI_ADVERTISING)) return 0; if (next && !ext_adv_capable(hdev)) hci_schedule_adv_instance_sync(hdev, next->instance, false); return 0; } int hci_read_rssi_sync(struct hci_dev *hdev, __le16 handle) { struct hci_cp_read_rssi cp; cp.handle = handle; return __hci_cmd_sync_status(hdev, HCI_OP_READ_RSSI, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_read_clock_sync(struct hci_dev *hdev, struct hci_cp_read_clock *cp) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_CLOCK, sizeof(*cp), cp, HCI_CMD_TIMEOUT); } int hci_read_tx_power_sync(struct hci_dev *hdev, __le16 handle, u8 type) { struct hci_cp_read_tx_power cp; cp.handle = handle; cp.type = type; return __hci_cmd_sync_status(hdev, HCI_OP_READ_TX_POWER, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_disable_advertising_sync(struct hci_dev *hdev) { u8 enable = 0x00; /* If controller is not advertising we are done. */ if (!hci_dev_test_flag(hdev, HCI_LE_ADV)) return 0; if (ext_adv_capable(hdev)) return hci_disable_ext_adv_instance_sync(hdev, 0x00); return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable, HCI_CMD_TIMEOUT); } static int hci_le_set_ext_scan_enable_sync(struct hci_dev *hdev, u8 val, u8 filter_dup) { struct hci_cp_le_set_ext_scan_enable cp; memset(&cp, 0, sizeof(cp)); cp.enable = val; if (hci_dev_test_flag(hdev, HCI_MESH)) cp.filter_dup = LE_SCAN_FILTER_DUP_DISABLE; else cp.filter_dup = filter_dup; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_SCAN_ENABLE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_le_set_scan_enable_sync(struct hci_dev *hdev, u8 val, u8 filter_dup) { struct hci_cp_le_set_scan_enable cp; if (use_ext_scan(hdev)) return hci_le_set_ext_scan_enable_sync(hdev, val, filter_dup); memset(&cp, 0, sizeof(cp)); cp.enable = val; if (val && hci_dev_test_flag(hdev, HCI_MESH)) cp.filter_dup = LE_SCAN_FILTER_DUP_DISABLE; else cp.filter_dup = filter_dup; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_le_set_addr_resolution_enable_sync(struct hci_dev *hdev, u8 val) { if (!ll_privacy_capable(hdev)) return 0; /* If controller is not/already resolving we are done. */ if (val == hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADDR_RESOLV_ENABLE, sizeof(val), &val, HCI_CMD_TIMEOUT); } static int hci_scan_disable_sync(struct hci_dev *hdev) { int err; /* If controller is not scanning we are done. */ if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) return 0; if (hdev->scanning_paused) { bt_dev_dbg(hdev, "Scanning is paused for suspend"); return 0; } err = hci_le_set_scan_enable_sync(hdev, LE_SCAN_DISABLE, 0x00); if (err) { bt_dev_err(hdev, "Unable to disable scanning: %d", err); return err; } return err; } static bool scan_use_rpa(struct hci_dev *hdev) { return hci_dev_test_flag(hdev, HCI_PRIVACY); } static void hci_start_interleave_scan(struct hci_dev *hdev) { hdev->interleave_scan_state = INTERLEAVE_SCAN_NO_FILTER; queue_delayed_work(hdev->req_workqueue, &hdev->interleave_scan, 0); } static void cancel_interleave_scan(struct hci_dev *hdev) { bt_dev_dbg(hdev, "cancelling interleave scan"); cancel_delayed_work_sync(&hdev->interleave_scan); hdev->interleave_scan_state = INTERLEAVE_SCAN_NONE; } /* Return true if interleave_scan wasn't started until exiting this function, * otherwise, return false */ static bool hci_update_interleaved_scan_sync(struct hci_dev *hdev) { /* Do interleaved scan only if all of the following are true: * - There is at least one ADV monitor * - At least one pending LE connection or one device to be scanned for * - Monitor offloading is not supported * If so, we should alternate between allowlist scan and one without * any filters to save power. */ bool use_interleaving = hci_is_adv_monitoring(hdev) && !(list_empty(&hdev->pend_le_conns) && list_empty(&hdev->pend_le_reports)) && hci_get_adv_monitor_offload_ext(hdev) == HCI_ADV_MONITOR_EXT_NONE; bool is_interleaving = is_interleave_scanning(hdev); if (use_interleaving && !is_interleaving) { hci_start_interleave_scan(hdev); bt_dev_dbg(hdev, "starting interleave scan"); return true; } if (!use_interleaving && is_interleaving) cancel_interleave_scan(hdev); return false; } /* Removes connection to resolve list if needed.*/ static int hci_le_del_resolve_list_sync(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type) { struct hci_cp_le_del_from_resolv_list cp; struct bdaddr_list_with_irk *entry; if (!ll_privacy_capable(hdev)) return 0; /* Check if the IRK has been programmed */ entry = hci_bdaddr_list_lookup_with_irk(&hdev->le_resolv_list, bdaddr, bdaddr_type); if (!entry) return 0; cp.bdaddr_type = bdaddr_type; bacpy(&cp.bdaddr, bdaddr); return __hci_cmd_sync_status(hdev, HCI_OP_LE_DEL_FROM_RESOLV_LIST, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_le_del_accept_list_sync(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type) { struct hci_cp_le_del_from_accept_list cp; int err; /* Check if device is on accept list before removing it */ if (!hci_bdaddr_list_lookup(&hdev->le_accept_list, bdaddr, bdaddr_type)) return 0; cp.bdaddr_type = bdaddr_type; bacpy(&cp.bdaddr, bdaddr); /* Ignore errors when removing from resolving list as that is likely * that the device was never added. */ hci_le_del_resolve_list_sync(hdev, &cp.bdaddr, cp.bdaddr_type); err = __hci_cmd_sync_status(hdev, HCI_OP_LE_DEL_FROM_ACCEPT_LIST, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (err) { bt_dev_err(hdev, "Unable to remove from allow list: %d", err); return err; } bt_dev_dbg(hdev, "Remove %pMR (0x%x) from allow list", &cp.bdaddr, cp.bdaddr_type); return 0; } struct conn_params { bdaddr_t addr; u8 addr_type; hci_conn_flags_t flags; u8 privacy_mode; }; /* Adds connection to resolve list if needed. * Setting params to NULL programs local hdev->irk */ static int hci_le_add_resolve_list_sync(struct hci_dev *hdev, struct conn_params *params) { struct hci_cp_le_add_to_resolv_list cp; struct smp_irk *irk; struct bdaddr_list_with_irk *entry; struct hci_conn_params *p; if (!ll_privacy_capable(hdev)) return 0; /* Attempt to program local identity address, type and irk if params is * NULL. */ if (!params) { if (!hci_dev_test_flag(hdev, HCI_PRIVACY)) return 0; hci_copy_identity_address(hdev, &cp.bdaddr, &cp.bdaddr_type); memcpy(cp.peer_irk, hdev->irk, 16); goto done; } else if (!(params->flags & HCI_CONN_FLAG_ADDRESS_RESOLUTION)) return 0; irk = hci_find_irk_by_addr(hdev, &params->addr, params->addr_type); if (!irk) return 0; /* Check if the IK has _not_ been programmed yet. */ entry = hci_bdaddr_list_lookup_with_irk(&hdev->le_resolv_list, &params->addr, params->addr_type); if (entry) return 0; cp.bdaddr_type = params->addr_type; bacpy(&cp.bdaddr, &params->addr); memcpy(cp.peer_irk, irk->val, 16); /* Default privacy mode is always Network */ params->privacy_mode = HCI_NETWORK_PRIVACY; rcu_read_lock(); p = hci_pend_le_action_lookup(&hdev->pend_le_conns, &params->addr, params->addr_type); if (!p) p = hci_pend_le_action_lookup(&hdev->pend_le_reports, &params->addr, params->addr_type); if (p) WRITE_ONCE(p->privacy_mode, HCI_NETWORK_PRIVACY); rcu_read_unlock(); done: if (hci_dev_test_flag(hdev, HCI_PRIVACY)) memcpy(cp.local_irk, hdev->irk, 16); else memset(cp.local_irk, 0, 16); return __hci_cmd_sync_status(hdev, HCI_OP_LE_ADD_TO_RESOLV_LIST, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } /* Set Device Privacy Mode. */ static int hci_le_set_privacy_mode_sync(struct hci_dev *hdev, struct conn_params *params) { struct hci_cp_le_set_privacy_mode cp; struct smp_irk *irk; if (!ll_privacy_capable(hdev) || !(params->flags & HCI_CONN_FLAG_ADDRESS_RESOLUTION)) return 0; /* If device privacy mode has already been set there is nothing to do */ if (params->privacy_mode == HCI_DEVICE_PRIVACY) return 0; /* Check if HCI_CONN_FLAG_DEVICE_PRIVACY has been set as it also * indicates that LL Privacy has been enabled and * HCI_OP_LE_SET_PRIVACY_MODE is supported. */ if (!(params->flags & HCI_CONN_FLAG_DEVICE_PRIVACY)) return 0; irk = hci_find_irk_by_addr(hdev, &params->addr, params->addr_type); if (!irk) return 0; memset(&cp, 0, sizeof(cp)); cp.bdaddr_type = irk->addr_type; bacpy(&cp.bdaddr, &irk->bdaddr); cp.mode = HCI_DEVICE_PRIVACY; /* Note: params->privacy_mode is not updated since it is a copy */ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PRIVACY_MODE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } /* Adds connection to allow list if needed, if the device uses RPA (has IRK) * this attempts to program the device in the resolving list as well and * properly set the privacy mode. */ static int hci_le_add_accept_list_sync(struct hci_dev *hdev, struct conn_params *params, u8 *num_entries) { struct hci_cp_le_add_to_accept_list cp; int err; /* During suspend, only wakeable devices can be in acceptlist */ if (hdev->suspended && !(params->flags & HCI_CONN_FLAG_REMOTE_WAKEUP)) { hci_le_del_accept_list_sync(hdev, &params->addr, params->addr_type); return 0; } /* Select filter policy to accept all advertising */ if (*num_entries >= hdev->le_accept_list_size) return -ENOSPC; /* Attempt to program the device in the resolving list first to avoid * having to rollback in case it fails since the resolving list is * dynamic it can probably be smaller than the accept list. */ err = hci_le_add_resolve_list_sync(hdev, params); if (err) { bt_dev_err(hdev, "Unable to add to resolve list: %d", err); return err; } /* Set Privacy Mode */ err = hci_le_set_privacy_mode_sync(hdev, params); if (err) { bt_dev_err(hdev, "Unable to set privacy mode: %d", err); return err; } /* Check if already in accept list */ if (hci_bdaddr_list_lookup(&hdev->le_accept_list, &params->addr, params->addr_type)) return 0; *num_entries += 1; cp.bdaddr_type = params->addr_type; bacpy(&cp.bdaddr, &params->addr); err = __hci_cmd_sync_status(hdev, HCI_OP_LE_ADD_TO_ACCEPT_LIST, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (err) { bt_dev_err(hdev, "Unable to add to allow list: %d", err); /* Rollback the device from the resolving list */ hci_le_del_resolve_list_sync(hdev, &cp.bdaddr, cp.bdaddr_type); return err; } bt_dev_dbg(hdev, "Add %pMR (0x%x) to allow list", &cp.bdaddr, cp.bdaddr_type); return 0; } /* This function disables/pause all advertising instances */ static int hci_pause_advertising_sync(struct hci_dev *hdev) { int err; int old_state; /* If controller is not advertising we are done. */ if (!hci_dev_test_flag(hdev, HCI_LE_ADV)) return 0; /* If already been paused there is nothing to do. */ if (hdev->advertising_paused) return 0; bt_dev_dbg(hdev, "Pausing directed advertising"); /* Stop directed advertising */ old_state = hci_dev_test_flag(hdev, HCI_ADVERTISING); if (old_state) { /* When discoverable timeout triggers, then just make sure * the limited discoverable flag is cleared. Even in the case * of a timeout triggered from general discoverable, it is * safe to unconditionally clear the flag. */ hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE); hci_dev_clear_flag(hdev, HCI_DISCOVERABLE); hdev->discov_timeout = 0; } bt_dev_dbg(hdev, "Pausing advertising instances"); /* Call to disable any advertisements active on the controller. * This will succeed even if no advertisements are configured. */ err = hci_disable_advertising_sync(hdev); if (err) return err; /* If we are using software rotation, pause the loop */ if (!ext_adv_capable(hdev)) cancel_adv_timeout(hdev); hdev->advertising_paused = true; hdev->advertising_old_state = old_state; return 0; } /* This function enables all user advertising instances */ static int hci_resume_advertising_sync(struct hci_dev *hdev) { struct adv_info *adv, *tmp; int err; /* If advertising has not been paused there is nothing to do. */ if (!hdev->advertising_paused) return 0; /* Resume directed advertising */ hdev->advertising_paused = false; if (hdev->advertising_old_state) { hci_dev_set_flag(hdev, HCI_ADVERTISING); hdev->advertising_old_state = 0; } bt_dev_dbg(hdev, "Resuming advertising instances"); if (ext_adv_capable(hdev)) { /* Call for each tracked instance to be re-enabled */ list_for_each_entry_safe(adv, tmp, &hdev->adv_instances, list) { err = hci_enable_ext_advertising_sync(hdev, adv->instance); if (!err) continue; /* If the instance cannot be resumed remove it */ hci_remove_ext_adv_instance_sync(hdev, adv->instance, NULL); } /* If current advertising instance is set to instance 0x00 * then we need to re-enable it. */ if (!hdev->cur_adv_instance) err = hci_enable_ext_advertising_sync(hdev, hdev->cur_adv_instance); } else { /* Schedule for most recent instance to be restarted and begin * the software rotation loop */ err = hci_schedule_adv_instance_sync(hdev, hdev->cur_adv_instance, true); } hdev->advertising_paused = false; return err; } static int hci_pause_addr_resolution(struct hci_dev *hdev) { int err; if (!ll_privacy_capable(hdev)) return 0; if (!hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION)) return 0; /* Cannot disable addr resolution if scanning is enabled or * when initiating an LE connection. */ if (hci_dev_test_flag(hdev, HCI_LE_SCAN) || hci_lookup_le_connect(hdev)) { bt_dev_err(hdev, "Command not allowed when scan/LE connect"); return -EPERM; } /* Cannot disable addr resolution if advertising is enabled. */ err = hci_pause_advertising_sync(hdev); if (err) { bt_dev_err(hdev, "Pause advertising failed: %d", err); return err; } err = hci_le_set_addr_resolution_enable_sync(hdev, 0x00); if (err) bt_dev_err(hdev, "Unable to disable Address Resolution: %d", err); /* Return if address resolution is disabled and RPA is not used. */ if (!err && scan_use_rpa(hdev)) return 0; hci_resume_advertising_sync(hdev); return err; } struct sk_buff *hci_read_local_oob_data_sync(struct hci_dev *hdev, bool extended, struct sock *sk) { u16 opcode = extended ? HCI_OP_READ_LOCAL_OOB_EXT_DATA : HCI_OP_READ_LOCAL_OOB_DATA; return __hci_cmd_sync_sk(hdev, opcode, 0, NULL, 0, HCI_CMD_TIMEOUT, sk); } static struct conn_params *conn_params_copy(struct list_head *list, size_t *n) { struct hci_conn_params *params; struct conn_params *p; size_t i; rcu_read_lock(); i = 0; list_for_each_entry_rcu(params, list, action) ++i; *n = i; rcu_read_unlock(); p = kvcalloc(*n, sizeof(struct conn_params), GFP_KERNEL); if (!p) return NULL; rcu_read_lock(); i = 0; list_for_each_entry_rcu(params, list, action) { /* Racing adds are handled in next scan update */ if (i >= *n) break; /* No hdev->lock, but: addr, addr_type are immutable. * privacy_mode is only written by us or in * hci_cc_le_set_privacy_mode that we wait for. * We should be idempotent so MGMT updating flags * while we are processing is OK. */ bacpy(&p[i].addr, &params->addr); p[i].addr_type = params->addr_type; p[i].flags = READ_ONCE(params->flags); p[i].privacy_mode = READ_ONCE(params->privacy_mode); ++i; } rcu_read_unlock(); *n = i; return p; } /* Clear LE Accept List */ static int hci_le_clear_accept_list_sync(struct hci_dev *hdev) { if (!(hdev->commands[26] & 0x80)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_CLEAR_ACCEPT_LIST, 0, NULL, HCI_CMD_TIMEOUT); } /* Device must not be scanning when updating the accept list. * * Update is done using the following sequence: * * ll_privacy_capable((Disable Advertising) -> Disable Resolving List) -> * Remove Devices From Accept List -> * (has IRK && ll_privacy_capable(Remove Devices From Resolving List))-> * Add Devices to Accept List -> * (has IRK && ll_privacy_capable(Remove Devices From Resolving List)) -> * ll_privacy_capable(Enable Resolving List -> (Enable Advertising)) -> * Enable Scanning * * In case of failure advertising shall be restored to its original state and * return would disable accept list since either accept or resolving list could * not be programmed. * */ static u8 hci_update_accept_list_sync(struct hci_dev *hdev) { struct conn_params *params; struct bdaddr_list *b, *t; u8 num_entries = 0; bool pend_conn, pend_report; u8 filter_policy; size_t i, n; int err; /* Pause advertising if resolving list can be used as controllers * cannot accept resolving list modifications while advertising. */ if (ll_privacy_capable(hdev)) { err = hci_pause_advertising_sync(hdev); if (err) { bt_dev_err(hdev, "pause advertising failed: %d", err); return 0x00; } } /* Disable address resolution while reprogramming accept list since * devices that do have an IRK will be programmed in the resolving list * when LL Privacy is enabled. */ err = hci_le_set_addr_resolution_enable_sync(hdev, 0x00); if (err) { bt_dev_err(hdev, "Unable to disable LL privacy: %d", err); goto done; } /* Force address filtering if PA Sync is in progress */ if (hci_dev_test_flag(hdev, HCI_PA_SYNC)) { struct hci_conn *conn; conn = hci_conn_hash_lookup_create_pa_sync(hdev); if (conn) { struct conn_params pa; memset(&pa, 0, sizeof(pa)); bacpy(&pa.addr, &conn->dst); pa.addr_type = conn->dst_type; /* Clear first since there could be addresses left * behind. */ hci_le_clear_accept_list_sync(hdev); num_entries = 1; err = hci_le_add_accept_list_sync(hdev, &pa, &num_entries); goto done; } } /* Go through the current accept list programmed into the * controller one by one and check if that address is connected or is * still in the list of pending connections or list of devices to * report. If not present in either list, then remove it from * the controller. */ list_for_each_entry_safe(b, t, &hdev->le_accept_list, list) { if (hci_conn_hash_lookup_le(hdev, &b->bdaddr, b->bdaddr_type)) continue; /* Pointers not dereferenced, no locks needed */ pend_conn = hci_pend_le_action_lookup(&hdev->pend_le_conns, &b->bdaddr, b->bdaddr_type); pend_report = hci_pend_le_action_lookup(&hdev->pend_le_reports, &b->bdaddr, b->bdaddr_type); /* If the device is not likely to connect or report, * remove it from the acceptlist. */ if (!pend_conn && !pend_report) { hci_le_del_accept_list_sync(hdev, &b->bdaddr, b->bdaddr_type); continue; } num_entries++; } /* Since all no longer valid accept list entries have been * removed, walk through the list of pending connections * and ensure that any new device gets programmed into * the controller. * * If the list of the devices is larger than the list of * available accept list entries in the controller, then * just abort and return filer policy value to not use the * accept list. * * The list and params may be mutated while we wait for events, * so make a copy and iterate it. */ params = conn_params_copy(&hdev->pend_le_conns, &n); if (!params) { err = -ENOMEM; goto done; } for (i = 0; i < n; ++i) { err = hci_le_add_accept_list_sync(hdev, &params[i], &num_entries); if (err) { kvfree(params); goto done; } } kvfree(params); /* After adding all new pending connections, walk through * the list of pending reports and also add these to the * accept list if there is still space. Abort if space runs out. */ params = conn_params_copy(&hdev->pend_le_reports, &n); if (!params) { err = -ENOMEM; goto done; } for (i = 0; i < n; ++i) { err = hci_le_add_accept_list_sync(hdev, &params[i], &num_entries); if (err) { kvfree(params); goto done; } } kvfree(params); /* Use the allowlist unless the following conditions are all true: * - We are not currently suspending * - There are 1 or more ADV monitors registered and it's not offloaded * - Interleaved scanning is not currently using the allowlist */ if (!idr_is_empty(&hdev->adv_monitors_idr) && !hdev->suspended && hci_get_adv_monitor_offload_ext(hdev) == HCI_ADV_MONITOR_EXT_NONE && hdev->interleave_scan_state != INTERLEAVE_SCAN_ALLOWLIST) err = -EINVAL; done: filter_policy = err ? 0x00 : 0x01; /* Enable address resolution when LL Privacy is enabled. */ err = hci_le_set_addr_resolution_enable_sync(hdev, 0x01); if (err) bt_dev_err(hdev, "Unable to enable LL privacy: %d", err); /* Resume advertising if it was paused */ if (ll_privacy_capable(hdev)) hci_resume_advertising_sync(hdev); /* Select filter policy to use accept list */ return filter_policy; } static void hci_le_scan_phy_params(struct hci_cp_le_scan_phy_params *cp, u8 type, u16 interval, u16 window) { cp->type = type; cp->interval = cpu_to_le16(interval); cp->window = cpu_to_le16(window); } static int hci_le_set_ext_scan_param_sync(struct hci_dev *hdev, u8 type, u16 interval, u16 window, u8 own_addr_type, u8 filter_policy) { struct hci_cp_le_set_ext_scan_params *cp; struct hci_cp_le_scan_phy_params *phy; u8 data[sizeof(*cp) + sizeof(*phy) * 2]; u8 num_phy = 0x00; cp = (void *)data; phy = (void *)cp->data; memset(data, 0, sizeof(data)); cp->own_addr_type = own_addr_type; cp->filter_policy = filter_policy; /* Check if PA Sync is in progress then select the PHY based on the * hci_conn.iso_qos. */ if (hci_dev_test_flag(hdev, HCI_PA_SYNC)) { struct hci_cp_le_add_to_accept_list *sent; sent = hci_sent_cmd_data(hdev, HCI_OP_LE_ADD_TO_ACCEPT_LIST); if (sent) { struct hci_conn *conn; conn = hci_conn_hash_lookup_ba(hdev, PA_LINK, &sent->bdaddr); if (conn) { struct bt_iso_qos *qos = &conn->iso_qos; if (qos->bcast.in.phy & BT_ISO_PHY_1M || qos->bcast.in.phy & BT_ISO_PHY_2M) { cp->scanning_phys |= LE_SCAN_PHY_1M; hci_le_scan_phy_params(phy, type, interval, window); num_phy++; phy++; } if (qos->bcast.in.phy & BT_ISO_PHY_CODED) { cp->scanning_phys |= LE_SCAN_PHY_CODED; hci_le_scan_phy_params(phy, type, interval * 3, window * 3); num_phy++; phy++; } if (num_phy) goto done; } } } if (scan_1m(hdev) || scan_2m(hdev)) { cp->scanning_phys |= LE_SCAN_PHY_1M; hci_le_scan_phy_params(phy, type, interval, window); num_phy++; phy++; } if (scan_coded(hdev)) { cp->scanning_phys |= LE_SCAN_PHY_CODED; hci_le_scan_phy_params(phy, type, interval * 3, window * 3); num_phy++; phy++; } done: if (!num_phy) return -EINVAL; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_SCAN_PARAMS, sizeof(*cp) + sizeof(*phy) * num_phy, data, HCI_CMD_TIMEOUT); } static int hci_le_set_scan_param_sync(struct hci_dev *hdev, u8 type, u16 interval, u16 window, u8 own_addr_type, u8 filter_policy) { struct hci_cp_le_set_scan_param cp; if (use_ext_scan(hdev)) return hci_le_set_ext_scan_param_sync(hdev, type, interval, window, own_addr_type, filter_policy); memset(&cp, 0, sizeof(cp)); cp.type = type; cp.interval = cpu_to_le16(interval); cp.window = cpu_to_le16(window); cp.own_address_type = own_addr_type; cp.filter_policy = filter_policy; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_SCAN_PARAM, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_start_scan_sync(struct hci_dev *hdev, u8 type, u16 interval, u16 window, u8 own_addr_type, u8 filter_policy, u8 filter_dup) { int err; if (hdev->scanning_paused) { bt_dev_dbg(hdev, "Scanning is paused for suspend"); return 0; } err = hci_le_set_scan_param_sync(hdev, type, interval, window, own_addr_type, filter_policy); if (err) return err; return hci_le_set_scan_enable_sync(hdev, LE_SCAN_ENABLE, filter_dup); } static int hci_passive_scan_sync(struct hci_dev *hdev) { u8 own_addr_type; u8 filter_policy; u16 window, interval; u8 filter_dups = LE_SCAN_FILTER_DUP_ENABLE; int err; if (hdev->scanning_paused) { bt_dev_dbg(hdev, "Scanning is paused for suspend"); return 0; } err = hci_scan_disable_sync(hdev); if (err) { bt_dev_err(hdev, "disable scanning failed: %d", err); return err; } /* Set require_privacy to false since no SCAN_REQ are send * during passive scanning. Not using an non-resolvable address * here is important so that peer devices using direct * advertising with our address will be correctly reported * by the controller. */ if (hci_update_random_address_sync(hdev, false, scan_use_rpa(hdev), &own_addr_type)) return 0; if (hdev->enable_advmon_interleave_scan && hci_update_interleaved_scan_sync(hdev)) return 0; bt_dev_dbg(hdev, "interleave state %d", hdev->interleave_scan_state); /* Adding or removing entries from the accept list must * happen before enabling scanning. The controller does * not allow accept list modification while scanning. */ filter_policy = hci_update_accept_list_sync(hdev); /* If suspended and filter_policy set to 0x00 (no acceptlist) then * passive scanning cannot be started since that would require the host * to be woken up to process the reports. */ if (hdev->suspended && !filter_policy) { /* Check if accept list is empty then there is no need to scan * while suspended. */ if (list_empty(&hdev->le_accept_list)) return 0; /* If there are devices is the accept_list that means some * devices could not be programmed which in non-suspended case * means filter_policy needs to be set to 0x00 so the host needs * to filter, but since this is treating suspended case we * can ignore device needing host to filter to allow devices in * the acceptlist to be able to wakeup the system. */ filter_policy = 0x01; } /* When the controller is using random resolvable addresses and * with that having LE privacy enabled, then controllers with * Extended Scanner Filter Policies support can now enable support * for handling directed advertising. * * So instead of using filter polices 0x00 (no acceptlist) * and 0x01 (acceptlist enabled) use the new filter policies * 0x02 (no acceptlist) and 0x03 (acceptlist enabled). */ if (hci_dev_test_flag(hdev, HCI_PRIVACY) && (hdev->le_features[0] & HCI_LE_EXT_SCAN_POLICY)) filter_policy |= 0x02; if (hdev->suspended) { window = hdev->le_scan_window_suspend; interval = hdev->le_scan_int_suspend; } else if (hci_is_le_conn_scanning(hdev)) { window = hdev->le_scan_window_connect; interval = hdev->le_scan_int_connect; } else if (hci_is_adv_monitoring(hdev)) { window = hdev->le_scan_window_adv_monitor; interval = hdev->le_scan_int_adv_monitor; /* Disable duplicates filter when scanning for advertisement * monitor for the following reasons. * * For HW pattern filtering (ex. MSFT), Realtek and Qualcomm * controllers ignore RSSI_Sampling_Period when the duplicates * filter is enabled. * * For SW pattern filtering, when we're not doing interleaved * scanning, it is necessary to disable duplicates filter, * otherwise hosts can only receive one advertisement and it's * impossible to know if a peer is still in range. */ filter_dups = LE_SCAN_FILTER_DUP_DISABLE; } else { window = hdev->le_scan_window; interval = hdev->le_scan_interval; } /* Disable all filtering for Mesh */ if (hci_dev_test_flag(hdev, HCI_MESH)) { filter_policy = 0; filter_dups = LE_SCAN_FILTER_DUP_DISABLE; } bt_dev_dbg(hdev, "LE passive scan with acceptlist = %d", filter_policy); return hci_start_scan_sync(hdev, LE_SCAN_PASSIVE, interval, window, own_addr_type, filter_policy, filter_dups); } /* This function controls the passive scanning based on hdev->pend_le_conns * list. If there are pending LE connection we start the background scanning, * otherwise we stop it in the following sequence: * * If there are devices to scan: * * Disable Scanning -> Update Accept List -> * ll_privacy_capable((Disable Advertising) -> Disable Resolving List -> * Update Resolving List -> Enable Resolving List -> (Enable Advertising)) -> * Enable Scanning * * Otherwise: * * Disable Scanning */ int hci_update_passive_scan_sync(struct hci_dev *hdev) { int err; if (!test_bit(HCI_UP, &hdev->flags) || test_bit(HCI_INIT, &hdev->flags) || hci_dev_test_flag(hdev, HCI_SETUP) || hci_dev_test_flag(hdev, HCI_CONFIG) || hci_dev_test_flag(hdev, HCI_AUTO_OFF) || hci_dev_test_flag(hdev, HCI_UNREGISTER)) return 0; /* No point in doing scanning if LE support hasn't been enabled */ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) return 0; /* If discovery is active don't interfere with it */ if (hdev->discovery.state != DISCOVERY_STOPPED) return 0; /* Reset RSSI and UUID filters when starting background scanning * since these filters are meant for service discovery only. * * The Start Discovery and Start Service Discovery operations * ensure to set proper values for RSSI threshold and UUID * filter list. So it is safe to just reset them here. */ hci_discovery_filter_clear(hdev); bt_dev_dbg(hdev, "ADV monitoring is %s", hci_is_adv_monitoring(hdev) ? "on" : "off"); if (!hci_dev_test_flag(hdev, HCI_MESH) && list_empty(&hdev->pend_le_conns) && list_empty(&hdev->pend_le_reports) && !hci_is_adv_monitoring(hdev) && !hci_dev_test_flag(hdev, HCI_PA_SYNC)) { /* If there is no pending LE connections or devices * to be scanned for or no ADV monitors, we should stop the * background scanning. */ bt_dev_dbg(hdev, "stopping background scanning"); err = hci_scan_disable_sync(hdev); if (err) bt_dev_err(hdev, "stop background scanning failed: %d", err); } else { /* If there is at least one pending LE connection, we should * keep the background scan running. */ /* If controller is connecting, we should not start scanning * since some controllers are not able to scan and connect at * the same time. */ if (hci_lookup_le_connect(hdev)) return 0; bt_dev_dbg(hdev, "start background scanning"); err = hci_passive_scan_sync(hdev); if (err) bt_dev_err(hdev, "start background scanning failed: %d", err); } return err; } static int update_scan_sync(struct hci_dev *hdev, void *data) { return hci_update_scan_sync(hdev); } int hci_update_scan(struct hci_dev *hdev) { return hci_cmd_sync_queue(hdev, update_scan_sync, NULL, NULL); } static int update_passive_scan_sync(struct hci_dev *hdev, void *data) { return hci_update_passive_scan_sync(hdev); } int hci_update_passive_scan(struct hci_dev *hdev) { /* Only queue if it would have any effect */ if (!test_bit(HCI_UP, &hdev->flags) || test_bit(HCI_INIT, &hdev->flags) || hci_dev_test_flag(hdev, HCI_SETUP) || hci_dev_test_flag(hdev, HCI_CONFIG) || hci_dev_test_flag(hdev, HCI_AUTO_OFF) || hci_dev_test_flag(hdev, HCI_UNREGISTER)) return 0; return hci_cmd_sync_queue_once(hdev, update_passive_scan_sync, NULL, NULL); } int hci_write_sc_support_sync(struct hci_dev *hdev, u8 val) { int err; if (!bredr_sc_enabled(hdev) || lmp_host_sc_capable(hdev)) return 0; err = __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SC_SUPPORT, sizeof(val), &val, HCI_CMD_TIMEOUT); if (!err) { if (val) { hdev->features[1][0] |= LMP_HOST_SC; hci_dev_set_flag(hdev, HCI_SC_ENABLED); } else { hdev->features[1][0] &= ~LMP_HOST_SC; hci_dev_clear_flag(hdev, HCI_SC_ENABLED); } } return err; } int hci_write_ssp_mode_sync(struct hci_dev *hdev, u8 mode) { int err; if (!hci_dev_test_flag(hdev, HCI_SSP_ENABLED) || lmp_host_ssp_capable(hdev)) return 0; if (!mode && hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) { __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SSP_DEBUG_MODE, sizeof(mode), &mode, HCI_CMD_TIMEOUT); } err = __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SSP_MODE, sizeof(mode), &mode, HCI_CMD_TIMEOUT); if (err) return err; return hci_write_sc_support_sync(hdev, 0x01); } int hci_write_le_host_supported_sync(struct hci_dev *hdev, u8 le, u8 simul) { struct hci_cp_write_le_host_supported cp; if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) || !lmp_bredr_capable(hdev)) return 0; /* Check first if we already have the right host state * (host features set) */ if (le == lmp_host_le_capable(hdev) && simul == lmp_host_le_br_capable(hdev)) return 0; memset(&cp, 0, sizeof(cp)); cp.le = le; cp.simul = simul; return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_LE_HOST_SUPPORTED, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_powered_update_adv_sync(struct hci_dev *hdev) { struct adv_info *adv, *tmp; int err; if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) return 0; /* If RPA Resolution has not been enable yet it means the * resolving list is empty and we should attempt to program the * local IRK in order to support using own_addr_type * ADDR_LE_DEV_RANDOM_RESOLVED (0x03). */ if (!hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION)) { hci_le_add_resolve_list_sync(hdev, NULL); hci_le_set_addr_resolution_enable_sync(hdev, 0x01); } /* Make sure the controller has a good default for * advertising data. This also applies to the case * where BR/EDR was toggled during the AUTO_OFF phase. */ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) && list_empty(&hdev->adv_instances)) { if (ext_adv_capable(hdev)) { err = hci_setup_ext_adv_instance_sync(hdev, 0x00); if (!err) hci_update_scan_rsp_data_sync(hdev, 0x00); } else { err = hci_update_adv_data_sync(hdev, 0x00); if (!err) hci_update_scan_rsp_data_sync(hdev, 0x00); } if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) hci_enable_advertising_sync(hdev); } /* Call for each tracked instance to be scheduled */ list_for_each_entry_safe(adv, tmp, &hdev->adv_instances, list) hci_schedule_adv_instance_sync(hdev, adv->instance, true); return 0; } static int hci_write_auth_enable_sync(struct hci_dev *hdev) { u8 link_sec; link_sec = hci_dev_test_flag(hdev, HCI_LINK_SECURITY); if (link_sec == test_bit(HCI_AUTH, &hdev->flags)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_AUTH_ENABLE, sizeof(link_sec), &link_sec, HCI_CMD_TIMEOUT); } int hci_write_fast_connectable_sync(struct hci_dev *hdev, bool enable) { struct hci_cp_write_page_scan_activity cp; u8 type; int err = 0; if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) return 0; if (hdev->hci_ver < BLUETOOTH_VER_1_2) return 0; memset(&cp, 0, sizeof(cp)); if (enable) { type = PAGE_SCAN_TYPE_INTERLACED; /* 160 msec page scan interval */ cp.interval = cpu_to_le16(0x0100); } else { type = hdev->def_page_scan_type; cp.interval = cpu_to_le16(hdev->def_page_scan_int); } cp.window = cpu_to_le16(hdev->def_page_scan_window); if (__cpu_to_le16(hdev->page_scan_interval) != cp.interval || __cpu_to_le16(hdev->page_scan_window) != cp.window) { err = __hci_cmd_sync_status(hdev, HCI_OP_WRITE_PAGE_SCAN_ACTIVITY, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (err) return err; } if (hdev->page_scan_type != type) err = __hci_cmd_sync_status(hdev, HCI_OP_WRITE_PAGE_SCAN_TYPE, sizeof(type), &type, HCI_CMD_TIMEOUT); return err; } static bool disconnected_accept_list_entries(struct hci_dev *hdev) { struct bdaddr_list *b; list_for_each_entry(b, &hdev->accept_list, list) { struct hci_conn *conn; conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &b->bdaddr); if (!conn) return true; if (conn->state != BT_CONNECTED && conn->state != BT_CONFIG) return true; } return false; } static int hci_write_scan_enable_sync(struct hci_dev *hdev, u8 val) { return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SCAN_ENABLE, sizeof(val), &val, HCI_CMD_TIMEOUT); } int hci_update_scan_sync(struct hci_dev *hdev) { u8 scan; if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) return 0; if (!hdev_is_powered(hdev)) return 0; if (mgmt_powering_down(hdev)) return 0; if (hdev->scanning_paused) return 0; if (hci_dev_test_flag(hdev, HCI_CONNECTABLE) || disconnected_accept_list_entries(hdev)) scan = SCAN_PAGE; else scan = SCAN_DISABLED; if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE)) scan |= SCAN_INQUIRY; if (test_bit(HCI_PSCAN, &hdev->flags) == !!(scan & SCAN_PAGE) && test_bit(HCI_ISCAN, &hdev->flags) == !!(scan & SCAN_INQUIRY)) return 0; return hci_write_scan_enable_sync(hdev, scan); } int hci_update_name_sync(struct hci_dev *hdev, const u8 *name) { struct hci_cp_write_local_name cp; memset(&cp, 0, sizeof(cp)); memcpy(cp.name, name, sizeof(cp.name)); return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_LOCAL_NAME, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } /* This function perform powered update HCI command sequence after the HCI init * sequence which end up resetting all states, the sequence is as follows: * * HCI_SSP_ENABLED(Enable SSP) * HCI_LE_ENABLED(Enable LE) * HCI_LE_ENABLED(ll_privacy_capable(Add local IRK to Resolving List) -> * Update adv data) * Enable Authentication * lmp_bredr_capable(Set Fast Connectable -> Set Scan Type -> Set Class -> * Set Name -> Set EIR) * HCI_FORCE_STATIC_ADDR | BDADDR_ANY && !HCI_BREDR_ENABLED (Set Static Address) */ int hci_powered_update_sync(struct hci_dev *hdev) { int err; /* Register the available SMP channels (BR/EDR and LE) only when * successfully powering on the controller. This late * registration is required so that LE SMP can clearly decide if * the public address or static address is used. */ smp_register(hdev); err = hci_write_ssp_mode_sync(hdev, 0x01); if (err) return err; err = hci_write_le_host_supported_sync(hdev, 0x01, 0x00); if (err) return err; err = hci_powered_update_adv_sync(hdev); if (err) return err; err = hci_write_auth_enable_sync(hdev); if (err) return err; if (lmp_bredr_capable(hdev)) { if (hci_dev_test_flag(hdev, HCI_FAST_CONNECTABLE)) hci_write_fast_connectable_sync(hdev, true); else hci_write_fast_connectable_sync(hdev, false); hci_update_scan_sync(hdev); hci_update_class_sync(hdev); hci_update_name_sync(hdev, hdev->dev_name); hci_update_eir_sync(hdev); } /* If forcing static address is in use or there is no public * address use the static address as random address (but skip * the HCI command if the current random address is already the * static one. * * In case BR/EDR has been disabled on a dual-mode controller * and a static address has been configured, then use that * address instead of the public BR/EDR address. */ if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) || (!bacmp(&hdev->bdaddr, BDADDR_ANY) && !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))) { if (bacmp(&hdev->static_addr, BDADDR_ANY)) return hci_set_random_addr_sync(hdev, &hdev->static_addr); } return 0; } /** * hci_dev_get_bd_addr_from_property - Get the Bluetooth Device Address * (BD_ADDR) for a HCI device from * a firmware node property. * @hdev: The HCI device * * Search the firmware node for 'local-bd-address'. * * All-zero BD addresses are rejected, because those could be properties * that exist in the firmware tables, but were not updated by the firmware. For * example, the DTS could define 'local-bd-address', with zero BD addresses. */ static void hci_dev_get_bd_addr_from_property(struct hci_dev *hdev) { struct fwnode_handle *fwnode = dev_fwnode(hdev->dev.parent); bdaddr_t ba; int ret; ret = fwnode_property_read_u8_array(fwnode, "local-bd-address", (u8 *)&ba, sizeof(ba)); if (ret < 0 || !bacmp(&ba, BDADDR_ANY)) return; if (hci_test_quirk(hdev, HCI_QUIRK_BDADDR_PROPERTY_BROKEN)) baswap(&hdev->public_addr, &ba); else bacpy(&hdev->public_addr, &ba); } struct hci_init_stage { int (*func)(struct hci_dev *hdev); }; /* Run init stage NULL terminated function table */ static int hci_init_stage_sync(struct hci_dev *hdev, const struct hci_init_stage *stage) { size_t i; for (i = 0; stage[i].func; i++) { int err; err = stage[i].func(hdev); if (err) return err; } return 0; } /* Read Local Version */ static int hci_read_local_version_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_VERSION, 0, NULL, HCI_CMD_TIMEOUT); } /* Read BD Address */ static int hci_read_bd_addr_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_BD_ADDR, 0, NULL, HCI_CMD_TIMEOUT); } #define HCI_INIT(_func) \ { \ .func = _func, \ } static const struct hci_init_stage hci_init0[] = { /* HCI_OP_READ_LOCAL_VERSION */ HCI_INIT(hci_read_local_version_sync), /* HCI_OP_READ_BD_ADDR */ HCI_INIT(hci_read_bd_addr_sync), {} }; int hci_reset_sync(struct hci_dev *hdev) { int err; set_bit(HCI_RESET, &hdev->flags); err = __hci_cmd_sync_status(hdev, HCI_OP_RESET, 0, NULL, HCI_CMD_TIMEOUT); if (err) return err; return 0; } static int hci_init0_sync(struct hci_dev *hdev) { int err; bt_dev_dbg(hdev, ""); /* Reset */ if (!hci_test_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE)) { err = hci_reset_sync(hdev); if (err) return err; } return hci_init_stage_sync(hdev, hci_init0); } static int hci_unconf_init_sync(struct hci_dev *hdev) { int err; if (hci_test_quirk(hdev, HCI_QUIRK_RAW_DEVICE)) return 0; err = hci_init0_sync(hdev); if (err < 0) return err; if (hci_dev_test_flag(hdev, HCI_SETUP)) hci_debugfs_create_basic(hdev); return 0; } /* Read Local Supported Features. */ static int hci_read_local_features_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_FEATURES, 0, NULL, HCI_CMD_TIMEOUT); } /* BR Controller init stage 1 command sequence */ static const struct hci_init_stage br_init1[] = { /* HCI_OP_READ_LOCAL_FEATURES */ HCI_INIT(hci_read_local_features_sync), /* HCI_OP_READ_LOCAL_VERSION */ HCI_INIT(hci_read_local_version_sync), /* HCI_OP_READ_BD_ADDR */ HCI_INIT(hci_read_bd_addr_sync), {} }; /* Read Local Commands */ static int hci_read_local_cmds_sync(struct hci_dev *hdev) { /* All Bluetooth 1.2 and later controllers should support the * HCI command for reading the local supported commands. * * Unfortunately some controllers indicate Bluetooth 1.2 support, * but do not have support for this command. If that is the case, * the driver can quirk the behavior and skip reading the local * supported commands. */ if (hdev->hci_ver > BLUETOOTH_VER_1_1 && !hci_test_quirk(hdev, HCI_QUIRK_BROKEN_LOCAL_COMMANDS)) return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_COMMANDS, 0, NULL, HCI_CMD_TIMEOUT); return 0; } static int hci_init1_sync(struct hci_dev *hdev) { int err; bt_dev_dbg(hdev, ""); /* Reset */ if (!hci_test_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE)) { err = hci_reset_sync(hdev); if (err) return err; } return hci_init_stage_sync(hdev, br_init1); } /* Read Buffer Size (ACL mtu, max pkt, etc.) */ static int hci_read_buffer_size_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_BUFFER_SIZE, 0, NULL, HCI_CMD_TIMEOUT); } /* Read Class of Device */ static int hci_read_dev_class_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_CLASS_OF_DEV, 0, NULL, HCI_CMD_TIMEOUT); } /* Read Local Name */ static int hci_read_local_name_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_NAME, 0, NULL, HCI_CMD_TIMEOUT); } /* Read Voice Setting */ static int hci_read_voice_setting_sync(struct hci_dev *hdev) { if (!read_voice_setting_capable(hdev)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_VOICE_SETTING, 0, NULL, HCI_CMD_TIMEOUT); } /* Read Number of Supported IAC */ static int hci_read_num_supported_iac_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_NUM_SUPPORTED_IAC, 0, NULL, HCI_CMD_TIMEOUT); } /* Read Current IAC LAP */ static int hci_read_current_iac_lap_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_CURRENT_IAC_LAP, 0, NULL, HCI_CMD_TIMEOUT); } static int hci_set_event_filter_sync(struct hci_dev *hdev, u8 flt_type, u8 cond_type, bdaddr_t *bdaddr, u8 auto_accept) { struct hci_cp_set_event_filter cp; if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) return 0; if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL)) return 0; memset(&cp, 0, sizeof(cp)); cp.flt_type = flt_type; if (flt_type != HCI_FLT_CLEAR_ALL) { cp.cond_type = cond_type; bacpy(&cp.addr_conn_flt.bdaddr, bdaddr); cp.addr_conn_flt.auto_accept = auto_accept; } return __hci_cmd_sync_status(hdev, HCI_OP_SET_EVENT_FLT, flt_type == HCI_FLT_CLEAR_ALL ? sizeof(cp.flt_type) : sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_clear_event_filter_sync(struct hci_dev *hdev) { if (!hci_dev_test_flag(hdev, HCI_EVENT_FILTER_CONFIGURED)) return 0; /* In theory the state machine should not reach here unless * a hci_set_event_filter_sync() call succeeds, but we do * the check both for parity and as a future reminder. */ if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL)) return 0; return hci_set_event_filter_sync(hdev, HCI_FLT_CLEAR_ALL, 0x00, BDADDR_ANY, 0x00); } /* Connection accept timeout ~20 secs */ static int hci_write_ca_timeout_sync(struct hci_dev *hdev) { __le16 param = cpu_to_le16(0x7d00); return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_CA_TIMEOUT, sizeof(param), &param, HCI_CMD_TIMEOUT); } /* Enable SCO flow control if supported */ static int hci_write_sync_flowctl_sync(struct hci_dev *hdev) { struct hci_cp_write_sync_flowctl cp; int err; /* Check if the controller supports SCO and HCI_OP_WRITE_SYNC_FLOWCTL */ if (!lmp_sco_capable(hdev) || !(hdev->commands[10] & BIT(4)) || !hci_test_quirk(hdev, HCI_QUIRK_SYNC_FLOWCTL_SUPPORTED)) return 0; memset(&cp, 0, sizeof(cp)); cp.enable = 0x01; err = __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SYNC_FLOWCTL, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (!err) hci_dev_set_flag(hdev, HCI_SCO_FLOWCTL); return err; } /* BR Controller init stage 2 command sequence */ static const struct hci_init_stage br_init2[] = { /* HCI_OP_READ_BUFFER_SIZE */ HCI_INIT(hci_read_buffer_size_sync), /* HCI_OP_READ_CLASS_OF_DEV */ HCI_INIT(hci_read_dev_class_sync), /* HCI_OP_READ_LOCAL_NAME */ HCI_INIT(hci_read_local_name_sync), /* HCI_OP_READ_VOICE_SETTING */ HCI_INIT(hci_read_voice_setting_sync), /* HCI_OP_READ_NUM_SUPPORTED_IAC */ HCI_INIT(hci_read_num_supported_iac_sync), /* HCI_OP_READ_CURRENT_IAC_LAP */ HCI_INIT(hci_read_current_iac_lap_sync), /* HCI_OP_SET_EVENT_FLT */ HCI_INIT(hci_clear_event_filter_sync), /* HCI_OP_WRITE_CA_TIMEOUT */ HCI_INIT(hci_write_ca_timeout_sync), /* HCI_OP_WRITE_SYNC_FLOWCTL */ HCI_INIT(hci_write_sync_flowctl_sync), {} }; static int hci_write_ssp_mode_1_sync(struct hci_dev *hdev) { u8 mode = 0x01; if (!lmp_ssp_capable(hdev) || !hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) return 0; /* When SSP is available, then the host features page * should also be available as well. However some * controllers list the max_page as 0 as long as SSP * has not been enabled. To achieve proper debugging * output, force the minimum max_page to 1 at least. */ hdev->max_page = 0x01; return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SSP_MODE, sizeof(mode), &mode, HCI_CMD_TIMEOUT); } static int hci_write_eir_sync(struct hci_dev *hdev) { struct hci_cp_write_eir cp; if (!lmp_ssp_capable(hdev) || hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) return 0; memset(hdev->eir, 0, sizeof(hdev->eir)); memset(&cp, 0, sizeof(cp)); return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_EIR, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_write_inquiry_mode_sync(struct hci_dev *hdev) { u8 mode; if (!lmp_inq_rssi_capable(hdev) && !hci_test_quirk(hdev, HCI_QUIRK_FIXUP_INQUIRY_MODE)) return 0; /* If Extended Inquiry Result events are supported, then * they are clearly preferred over Inquiry Result with RSSI * events. */ mode = lmp_ext_inq_capable(hdev) ? 0x02 : 0x01; return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_INQUIRY_MODE, sizeof(mode), &mode, HCI_CMD_TIMEOUT); } static int hci_read_inq_rsp_tx_power_sync(struct hci_dev *hdev) { if (!lmp_inq_tx_pwr_capable(hdev)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_INQ_RSP_TX_POWER, 0, NULL, HCI_CMD_TIMEOUT); } static int hci_read_local_ext_features_sync(struct hci_dev *hdev, u8 page) { struct hci_cp_read_local_ext_features cp; if (!lmp_ext_feat_capable(hdev)) return 0; memset(&cp, 0, sizeof(cp)); cp.page = page; return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_EXT_FEATURES, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_read_local_ext_features_1_sync(struct hci_dev *hdev) { return hci_read_local_ext_features_sync(hdev, 0x01); } /* HCI Controller init stage 2 command sequence */ static const struct hci_init_stage hci_init2[] = { /* HCI_OP_READ_LOCAL_COMMANDS */ HCI_INIT(hci_read_local_cmds_sync), /* HCI_OP_WRITE_SSP_MODE */ HCI_INIT(hci_write_ssp_mode_1_sync), /* HCI_OP_WRITE_EIR */ HCI_INIT(hci_write_eir_sync), /* HCI_OP_WRITE_INQUIRY_MODE */ HCI_INIT(hci_write_inquiry_mode_sync), /* HCI_OP_READ_INQ_RSP_TX_POWER */ HCI_INIT(hci_read_inq_rsp_tx_power_sync), /* HCI_OP_READ_LOCAL_EXT_FEATURES */ HCI_INIT(hci_read_local_ext_features_1_sync), /* HCI_OP_WRITE_AUTH_ENABLE */ HCI_INIT(hci_write_auth_enable_sync), {} }; /* Read LE Buffer Size */ static int hci_le_read_buffer_size_sync(struct hci_dev *hdev) { /* Use Read LE Buffer Size V2 if supported */ if (iso_capable(hdev) && hdev->commands[41] & 0x20) return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_BUFFER_SIZE_V2, 0, NULL, HCI_CMD_TIMEOUT); return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_BUFFER_SIZE, 0, NULL, HCI_CMD_TIMEOUT); } /* Read LE Local Supported Features */ static int hci_le_read_local_features_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_LOCAL_FEATURES, 0, NULL, HCI_CMD_TIMEOUT); } /* Read LE Supported States */ static int hci_le_read_supported_states_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_SUPPORTED_STATES, 0, NULL, HCI_CMD_TIMEOUT); } /* LE Controller init stage 2 command sequence */ static const struct hci_init_stage le_init2[] = { /* HCI_OP_LE_READ_LOCAL_FEATURES */ HCI_INIT(hci_le_read_local_features_sync), /* HCI_OP_LE_READ_BUFFER_SIZE */ HCI_INIT(hci_le_read_buffer_size_sync), /* HCI_OP_LE_READ_SUPPORTED_STATES */ HCI_INIT(hci_le_read_supported_states_sync), {} }; static int hci_init2_sync(struct hci_dev *hdev) { int err; bt_dev_dbg(hdev, ""); err = hci_init_stage_sync(hdev, hci_init2); if (err) return err; if (lmp_bredr_capable(hdev)) { err = hci_init_stage_sync(hdev, br_init2); if (err) return err; } else { hci_dev_clear_flag(hdev, HCI_BREDR_ENABLED); } if (lmp_le_capable(hdev)) { err = hci_init_stage_sync(hdev, le_init2); if (err) return err; /* LE-only controllers have LE implicitly enabled */ if (!lmp_bredr_capable(hdev)) hci_dev_set_flag(hdev, HCI_LE_ENABLED); } return 0; } static int hci_set_event_mask_sync(struct hci_dev *hdev) { /* The second byte is 0xff instead of 0x9f (two reserved bits * disabled) since a Broadcom 1.2 dongle doesn't respond to the * command otherwise. */ u8 events[8] = { 0xff, 0xff, 0xfb, 0xff, 0x00, 0x00, 0x00, 0x00 }; /* CSR 1.1 dongles does not accept any bitfield so don't try to set * any event mask for pre 1.2 devices. */ if (hdev->hci_ver < BLUETOOTH_VER_1_2) return 0; if (lmp_bredr_capable(hdev)) { events[4] |= 0x01; /* Flow Specification Complete */ /* Don't set Disconnect Complete and mode change when * suspended as that would wakeup the host when disconnecting * due to suspend. */ if (hdev->suspended) { events[0] &= 0xef; events[2] &= 0xf7; } } else { /* Use a different default for LE-only devices */ memset(events, 0, sizeof(events)); events[1] |= 0x20; /* Command Complete */ events[1] |= 0x40; /* Command Status */ events[1] |= 0x80; /* Hardware Error */ /* If the controller supports the Disconnect command, enable * the corresponding event. In addition enable packet flow * control related events. */ if (hdev->commands[0] & 0x20) { /* Don't set Disconnect Complete when suspended as that * would wakeup the host when disconnecting due to * suspend. */ if (!hdev->suspended) events[0] |= 0x10; /* Disconnection Complete */ events[2] |= 0x04; /* Number of Completed Packets */ events[3] |= 0x02; /* Data Buffer Overflow */ } /* If the controller supports the Read Remote Version * Information command, enable the corresponding event. */ if (hdev->commands[2] & 0x80) events[1] |= 0x08; /* Read Remote Version Information * Complete */ if (hdev->le_features[0] & HCI_LE_ENCRYPTION) { events[0] |= 0x80; /* Encryption Change */ events[5] |= 0x80; /* Encryption Key Refresh Complete */ } } if (lmp_inq_rssi_capable(hdev) || hci_test_quirk(hdev, HCI_QUIRK_FIXUP_INQUIRY_MODE)) events[4] |= 0x02; /* Inquiry Result with RSSI */ if (lmp_ext_feat_capable(hdev)) events[4] |= 0x04; /* Read Remote Extended Features Complete */ if (lmp_esco_capable(hdev)) { events[5] |= 0x08; /* Synchronous Connection Complete */ events[5] |= 0x10; /* Synchronous Connection Changed */ } if (lmp_sniffsubr_capable(hdev)) events[5] |= 0x20; /* Sniff Subrating */ if (lmp_pause_enc_capable(hdev)) events[5] |= 0x80; /* Encryption Key Refresh Complete */ if (lmp_ext_inq_capable(hdev)) events[5] |= 0x40; /* Extended Inquiry Result */ if (lmp_no_flush_capable(hdev)) events[7] |= 0x01; /* Enhanced Flush Complete */ if (lmp_lsto_capable(hdev)) events[6] |= 0x80; /* Link Supervision Timeout Changed */ if (lmp_ssp_capable(hdev)) { events[6] |= 0x01; /* IO Capability Request */ events[6] |= 0x02; /* IO Capability Response */ events[6] |= 0x04; /* User Confirmation Request */ events[6] |= 0x08; /* User Passkey Request */ events[6] |= 0x10; /* Remote OOB Data Request */ events[6] |= 0x20; /* Simple Pairing Complete */ events[7] |= 0x04; /* User Passkey Notification */ events[7] |= 0x08; /* Keypress Notification */ events[7] |= 0x10; /* Remote Host Supported * Features Notification */ } if (lmp_le_capable(hdev)) events[7] |= 0x20; /* LE Meta-Event */ return __hci_cmd_sync_status(hdev, HCI_OP_SET_EVENT_MASK, sizeof(events), events, HCI_CMD_TIMEOUT); } static int hci_read_stored_link_key_sync(struct hci_dev *hdev) { struct hci_cp_read_stored_link_key cp; if (!(hdev->commands[6] & 0x20) || hci_test_quirk(hdev, HCI_QUIRK_BROKEN_STORED_LINK_KEY)) return 0; memset(&cp, 0, sizeof(cp)); bacpy(&cp.bdaddr, BDADDR_ANY); cp.read_all = 0x01; return __hci_cmd_sync_status(hdev, HCI_OP_READ_STORED_LINK_KEY, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_setup_link_policy_sync(struct hci_dev *hdev) { struct hci_cp_write_def_link_policy cp; u16 link_policy = 0; if (!(hdev->commands[5] & 0x10)) return 0; memset(&cp, 0, sizeof(cp)); if (lmp_rswitch_capable(hdev)) link_policy |= HCI_LP_RSWITCH; if (lmp_hold_capable(hdev)) link_policy |= HCI_LP_HOLD; if (lmp_sniff_capable(hdev)) link_policy |= HCI_LP_SNIFF; if (lmp_park_capable(hdev)) link_policy |= HCI_LP_PARK; cp.policy = cpu_to_le16(link_policy); return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_DEF_LINK_POLICY, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_read_page_scan_activity_sync(struct hci_dev *hdev) { if (!(hdev->commands[8] & 0x01)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_PAGE_SCAN_ACTIVITY, 0, NULL, HCI_CMD_TIMEOUT); } static int hci_read_def_err_data_reporting_sync(struct hci_dev *hdev) { if (!(hdev->commands[18] & 0x04) || !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING) || hci_test_quirk(hdev, HCI_QUIRK_BROKEN_ERR_DATA_REPORTING)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_DEF_ERR_DATA_REPORTING, 0, NULL, HCI_CMD_TIMEOUT); } static int hci_read_page_scan_type_sync(struct hci_dev *hdev) { /* Some older Broadcom based Bluetooth 1.2 controllers do not * support the Read Page Scan Type command. Check support for * this command in the bit mask of supported commands. */ if (!(hdev->commands[13] & 0x01) || hci_test_quirk(hdev, HCI_QUIRK_BROKEN_READ_PAGE_SCAN_TYPE)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_PAGE_SCAN_TYPE, 0, NULL, HCI_CMD_TIMEOUT); } /* Read features beyond page 1 if available */ static int hci_read_local_ext_features_all_sync(struct hci_dev *hdev) { u8 page; int err; if (!lmp_ext_feat_capable(hdev)) return 0; for (page = 2; page < HCI_MAX_PAGES && page <= hdev->max_page; page++) { err = hci_read_local_ext_features_sync(hdev, page); if (err) return err; } return 0; } /* HCI Controller init stage 3 command sequence */ static const struct hci_init_stage hci_init3[] = { /* HCI_OP_SET_EVENT_MASK */ HCI_INIT(hci_set_event_mask_sync), /* HCI_OP_READ_STORED_LINK_KEY */ HCI_INIT(hci_read_stored_link_key_sync), /* HCI_OP_WRITE_DEF_LINK_POLICY */ HCI_INIT(hci_setup_link_policy_sync), /* HCI_OP_READ_PAGE_SCAN_ACTIVITY */ HCI_INIT(hci_read_page_scan_activity_sync), /* HCI_OP_READ_DEF_ERR_DATA_REPORTING */ HCI_INIT(hci_read_def_err_data_reporting_sync), /* HCI_OP_READ_PAGE_SCAN_TYPE */ HCI_INIT(hci_read_page_scan_type_sync), /* HCI_OP_READ_LOCAL_EXT_FEATURES */ HCI_INIT(hci_read_local_ext_features_all_sync), {} }; static int hci_le_set_event_mask_sync(struct hci_dev *hdev) { u8 events[8]; if (!lmp_le_capable(hdev)) return 0; memset(events, 0, sizeof(events)); if (hdev->le_features[0] & HCI_LE_ENCRYPTION) events[0] |= 0x10; /* LE Long Term Key Request */ /* If controller supports the Connection Parameters Request * Link Layer Procedure, enable the corresponding event. */ if (hdev->le_features[0] & HCI_LE_CONN_PARAM_REQ_PROC) /* LE Remote Connection Parameter Request */ events[0] |= 0x20; /* If the controller supports the Data Length Extension * feature, enable the corresponding event. */ if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) events[0] |= 0x40; /* LE Data Length Change */ /* If the controller supports LL Privacy feature or LE Extended Adv, * enable the corresponding event. */ if (use_enhanced_conn_complete(hdev)) events[1] |= 0x02; /* LE Enhanced Connection Complete */ /* Mark Device Privacy if Privacy Mode is supported */ if (privacy_mode_capable(hdev)) hdev->conn_flags |= HCI_CONN_FLAG_DEVICE_PRIVACY; /* Mark Address Resolution if LL Privacy is supported */ if (ll_privacy_capable(hdev)) hdev->conn_flags |= HCI_CONN_FLAG_ADDRESS_RESOLUTION; /* If the controller supports Extended Scanner Filter * Policies, enable the corresponding event. */ if (hdev->le_features[0] & HCI_LE_EXT_SCAN_POLICY) events[1] |= 0x04; /* LE Direct Advertising Report */ /* If the controller supports Channel Selection Algorithm #2 * feature, enable the corresponding event. */ if (hdev->le_features[1] & HCI_LE_CHAN_SEL_ALG2) events[2] |= 0x08; /* LE Channel Selection Algorithm */ /* If the controller supports the LE Set Scan Enable command, * enable the corresponding advertising report event. */ if (hdev->commands[26] & 0x08) events[0] |= 0x02; /* LE Advertising Report */ /* If the controller supports the LE Create Connection * command, enable the corresponding event. */ if (hdev->commands[26] & 0x10) events[0] |= 0x01; /* LE Connection Complete */ /* If the controller supports the LE Connection Update * command, enable the corresponding event. */ if (hdev->commands[27] & 0x04) events[0] |= 0x04; /* LE Connection Update Complete */ /* If the controller supports the LE Read Remote Used Features * command, enable the corresponding event. */ if (hdev->commands[27] & 0x20) /* LE Read Remote Used Features Complete */ events[0] |= 0x08; /* If the controller supports the LE Read Local P-256 * Public Key command, enable the corresponding event. */ if (hdev->commands[34] & 0x02) /* LE Read Local P-256 Public Key Complete */ events[0] |= 0x80; /* If the controller supports the LE Generate DHKey * command, enable the corresponding event. */ if (hdev->commands[34] & 0x04) events[1] |= 0x01; /* LE Generate DHKey Complete */ /* If the controller supports the LE Set Default PHY or * LE Set PHY commands, enable the corresponding event. */ if (hdev->commands[35] & (0x20 | 0x40)) events[1] |= 0x08; /* LE PHY Update Complete */ /* If the controller supports LE Set Extended Scan Parameters * and LE Set Extended Scan Enable commands, enable the * corresponding event. */ if (use_ext_scan(hdev)) events[1] |= 0x10; /* LE Extended Advertising Report */ /* If the controller supports the LE Extended Advertising * command, enable the corresponding event. */ if (ext_adv_capable(hdev)) events[2] |= 0x02; /* LE Advertising Set Terminated */ if (cis_capable(hdev)) { events[3] |= 0x01; /* LE CIS Established */ if (cis_peripheral_capable(hdev)) events[3] |= 0x02; /* LE CIS Request */ } if (bis_capable(hdev)) { events[1] |= 0x20; /* LE PA Report */ events[1] |= 0x40; /* LE PA Sync Established */ events[3] |= 0x04; /* LE Create BIG Complete */ events[3] |= 0x08; /* LE Terminate BIG Complete */ events[3] |= 0x10; /* LE BIG Sync Established */ events[3] |= 0x20; /* LE BIG Sync Loss */ events[4] |= 0x02; /* LE BIG Info Advertising Report */ } return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EVENT_MASK, sizeof(events), events, HCI_CMD_TIMEOUT); } /* Read LE Advertising Channel TX Power */ static int hci_le_read_adv_tx_power_sync(struct hci_dev *hdev) { if ((hdev->commands[25] & 0x40) && !ext_adv_capable(hdev)) { /* HCI TS spec forbids mixing of legacy and extended * advertising commands wherein READ_ADV_TX_POWER is * also included. So do not call it if extended adv * is supported otherwise controller will return * COMMAND_DISALLOWED for extended commands. */ return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_ADV_TX_POWER, 0, NULL, HCI_CMD_TIMEOUT); } return 0; } /* Read LE Min/Max Tx Power*/ static int hci_le_read_tx_power_sync(struct hci_dev *hdev) { if (!(hdev->commands[38] & 0x80) || hci_test_quirk(hdev, HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_TRANSMIT_POWER, 0, NULL, HCI_CMD_TIMEOUT); } /* Read LE Accept List Size */ static int hci_le_read_accept_list_size_sync(struct hci_dev *hdev) { if (!(hdev->commands[26] & 0x40)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_ACCEPT_LIST_SIZE, 0, NULL, HCI_CMD_TIMEOUT); } /* Read LE Resolving List Size */ static int hci_le_read_resolv_list_size_sync(struct hci_dev *hdev) { if (!(hdev->commands[34] & 0x40)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_RESOLV_LIST_SIZE, 0, NULL, HCI_CMD_TIMEOUT); } /* Clear LE Resolving List */ static int hci_le_clear_resolv_list_sync(struct hci_dev *hdev) { if (!(hdev->commands[34] & 0x20)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_CLEAR_RESOLV_LIST, 0, NULL, HCI_CMD_TIMEOUT); } /* Set RPA timeout */ static int hci_le_set_rpa_timeout_sync(struct hci_dev *hdev) { __le16 timeout = cpu_to_le16(hdev->rpa_timeout); if (!(hdev->commands[35] & 0x04) || hci_test_quirk(hdev, HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_RPA_TIMEOUT, sizeof(timeout), &timeout, HCI_CMD_TIMEOUT); } /* Read LE Maximum Data Length */ static int hci_le_read_max_data_len_sync(struct hci_dev *hdev) { if (!(hdev->le_features[0] & HCI_LE_DATA_LEN_EXT)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_MAX_DATA_LEN, 0, NULL, HCI_CMD_TIMEOUT); } /* Read LE Suggested Default Data Length */ static int hci_le_read_def_data_len_sync(struct hci_dev *hdev) { if (!(hdev->le_features[0] & HCI_LE_DATA_LEN_EXT)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_DEF_DATA_LEN, 0, NULL, HCI_CMD_TIMEOUT); } /* Read LE Number of Supported Advertising Sets */ static int hci_le_read_num_support_adv_sets_sync(struct hci_dev *hdev) { if (!ext_adv_capable(hdev)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS, 0, NULL, HCI_CMD_TIMEOUT); } /* Write LE Host Supported */ static int hci_set_le_support_sync(struct hci_dev *hdev) { struct hci_cp_write_le_host_supported cp; /* LE-only devices do not support explicit enablement */ if (!lmp_bredr_capable(hdev)) return 0; memset(&cp, 0, sizeof(cp)); if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) { cp.le = 0x01; cp.simul = 0x00; } if (cp.le == lmp_host_le_capable(hdev)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_LE_HOST_SUPPORTED, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } /* LE Set Host Feature */ static int hci_le_set_host_feature_sync(struct hci_dev *hdev) { struct hci_cp_le_set_host_feature cp; if (!iso_capable(hdev)) return 0; memset(&cp, 0, sizeof(cp)); /* Connected Isochronous Channels (Host Support) */ cp.bit_number = 32; cp.bit_value = iso_enabled(hdev) ? 0x01 : 0x00; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_HOST_FEATURE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } /* LE Controller init stage 3 command sequence */ static const struct hci_init_stage le_init3[] = { /* HCI_OP_LE_SET_EVENT_MASK */ HCI_INIT(hci_le_set_event_mask_sync), /* HCI_OP_LE_READ_ADV_TX_POWER */ HCI_INIT(hci_le_read_adv_tx_power_sync), /* HCI_OP_LE_READ_TRANSMIT_POWER */ HCI_INIT(hci_le_read_tx_power_sync), /* HCI_OP_LE_READ_ACCEPT_LIST_SIZE */ HCI_INIT(hci_le_read_accept_list_size_sync), /* HCI_OP_LE_CLEAR_ACCEPT_LIST */ HCI_INIT(hci_le_clear_accept_list_sync), /* HCI_OP_LE_READ_RESOLV_LIST_SIZE */ HCI_INIT(hci_le_read_resolv_list_size_sync), /* HCI_OP_LE_CLEAR_RESOLV_LIST */ HCI_INIT(hci_le_clear_resolv_list_sync), /* HCI_OP_LE_SET_RPA_TIMEOUT */ HCI_INIT(hci_le_set_rpa_timeout_sync), /* HCI_OP_LE_READ_MAX_DATA_LEN */ HCI_INIT(hci_le_read_max_data_len_sync), /* HCI_OP_LE_READ_DEF_DATA_LEN */ HCI_INIT(hci_le_read_def_data_len_sync), /* HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS */ HCI_INIT(hci_le_read_num_support_adv_sets_sync), /* HCI_OP_WRITE_LE_HOST_SUPPORTED */ HCI_INIT(hci_set_le_support_sync), /* HCI_OP_LE_SET_HOST_FEATURE */ HCI_INIT(hci_le_set_host_feature_sync), {} }; static int hci_init3_sync(struct hci_dev *hdev) { int err; bt_dev_dbg(hdev, ""); err = hci_init_stage_sync(hdev, hci_init3); if (err) return err; if (lmp_le_capable(hdev)) return hci_init_stage_sync(hdev, le_init3); return 0; } static int hci_delete_stored_link_key_sync(struct hci_dev *hdev) { struct hci_cp_delete_stored_link_key cp; /* Some Broadcom based Bluetooth controllers do not support the * Delete Stored Link Key command. They are clearly indicating its * absence in the bit mask of supported commands. * * Check the supported commands and only if the command is marked * as supported send it. If not supported assume that the controller * does not have actual support for stored link keys which makes this * command redundant anyway. * * Some controllers indicate that they support handling deleting * stored link keys, but they don't. The quirk lets a driver * just disable this command. */ if (!(hdev->commands[6] & 0x80) || hci_test_quirk(hdev, HCI_QUIRK_BROKEN_STORED_LINK_KEY)) return 0; memset(&cp, 0, sizeof(cp)); bacpy(&cp.bdaddr, BDADDR_ANY); cp.delete_all = 0x01; return __hci_cmd_sync_status(hdev, HCI_OP_DELETE_STORED_LINK_KEY, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_set_event_mask_page_2_sync(struct hci_dev *hdev) { u8 events[8] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; bool changed = false; /* Set event mask page 2 if the HCI command for it is supported */ if (!(hdev->commands[22] & 0x04)) return 0; /* If Connectionless Peripheral Broadcast central role is supported * enable all necessary events for it. */ if (lmp_cpb_central_capable(hdev)) { events[1] |= 0x40; /* Triggered Clock Capture */ events[1] |= 0x80; /* Synchronization Train Complete */ events[2] |= 0x08; /* Truncated Page Complete */ events[2] |= 0x20; /* CPB Channel Map Change */ changed = true; } /* If Connectionless Peripheral Broadcast peripheral role is supported * enable all necessary events for it. */ if (lmp_cpb_peripheral_capable(hdev)) { events[2] |= 0x01; /* Synchronization Train Received */ events[2] |= 0x02; /* CPB Receive */ events[2] |= 0x04; /* CPB Timeout */ events[2] |= 0x10; /* Peripheral Page Response Timeout */ changed = true; } /* Enable Authenticated Payload Timeout Expired event if supported */ if (lmp_ping_capable(hdev) || hdev->le_features[0] & HCI_LE_PING) { events[2] |= 0x80; changed = true; } /* Some Broadcom based controllers indicate support for Set Event * Mask Page 2 command, but then actually do not support it. Since * the default value is all bits set to zero, the command is only * required if the event mask has to be changed. In case no change * to the event mask is needed, skip this command. */ if (!changed) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_SET_EVENT_MASK_PAGE_2, sizeof(events), events, HCI_CMD_TIMEOUT); } /* Read local codec list if the HCI command is supported */ static int hci_read_local_codecs_sync(struct hci_dev *hdev) { if (hdev->commands[45] & 0x04) hci_read_supported_codecs_v2(hdev); else if (hdev->commands[29] & 0x20) hci_read_supported_codecs(hdev); return 0; } /* Read local pairing options if the HCI command is supported */ static int hci_read_local_pairing_opts_sync(struct hci_dev *hdev) { if (!(hdev->commands[41] & 0x08)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_PAIRING_OPTS, 0, NULL, HCI_CMD_TIMEOUT); } /* Get MWS transport configuration if the HCI command is supported */ static int hci_get_mws_transport_config_sync(struct hci_dev *hdev) { if (!mws_transport_config_capable(hdev)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_GET_MWS_TRANSPORT_CONFIG, 0, NULL, HCI_CMD_TIMEOUT); } /* Check for Synchronization Train support */ static int hci_read_sync_train_params_sync(struct hci_dev *hdev) { if (!lmp_sync_train_capable(hdev)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_SYNC_TRAIN_PARAMS, 0, NULL, HCI_CMD_TIMEOUT); } /* Enable Secure Connections if supported and configured */ static int hci_write_sc_support_1_sync(struct hci_dev *hdev) { u8 support = 0x01; if (!hci_dev_test_flag(hdev, HCI_SSP_ENABLED) || !bredr_sc_enabled(hdev)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SC_SUPPORT, sizeof(support), &support, HCI_CMD_TIMEOUT); } /* Set erroneous data reporting if supported to the wideband speech * setting value */ static int hci_set_err_data_report_sync(struct hci_dev *hdev) { struct hci_cp_write_def_err_data_reporting cp; bool enabled = hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED); if (!(hdev->commands[18] & 0x08) || !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING) || hci_test_quirk(hdev, HCI_QUIRK_BROKEN_ERR_DATA_REPORTING)) return 0; if (enabled == hdev->err_data_reporting) return 0; memset(&cp, 0, sizeof(cp)); cp.err_data_reporting = enabled ? ERR_DATA_REPORTING_ENABLED : ERR_DATA_REPORTING_DISABLED; return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_DEF_ERR_DATA_REPORTING, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static const struct hci_init_stage hci_init4[] = { /* HCI_OP_DELETE_STORED_LINK_KEY */ HCI_INIT(hci_delete_stored_link_key_sync), /* HCI_OP_SET_EVENT_MASK_PAGE_2 */ HCI_INIT(hci_set_event_mask_page_2_sync), /* HCI_OP_READ_LOCAL_CODECS */ HCI_INIT(hci_read_local_codecs_sync), /* HCI_OP_READ_LOCAL_PAIRING_OPTS */ HCI_INIT(hci_read_local_pairing_opts_sync), /* HCI_OP_GET_MWS_TRANSPORT_CONFIG */ HCI_INIT(hci_get_mws_transport_config_sync), /* HCI_OP_READ_SYNC_TRAIN_PARAMS */ HCI_INIT(hci_read_sync_train_params_sync), /* HCI_OP_WRITE_SC_SUPPORT */ HCI_INIT(hci_write_sc_support_1_sync), /* HCI_OP_WRITE_DEF_ERR_DATA_REPORTING */ HCI_INIT(hci_set_err_data_report_sync), {} }; /* Set Suggested Default Data Length to maximum if supported */ static int hci_le_set_write_def_data_len_sync(struct hci_dev *hdev) { struct hci_cp_le_write_def_data_len cp; if (!(hdev->le_features[0] & HCI_LE_DATA_LEN_EXT)) return 0; memset(&cp, 0, sizeof(cp)); cp.tx_len = cpu_to_le16(hdev->le_max_tx_len); cp.tx_time = cpu_to_le16(hdev->le_max_tx_time); return __hci_cmd_sync_status(hdev, HCI_OP_LE_WRITE_DEF_DATA_LEN, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } /* Set Default PHY parameters if command is supported, enables all supported * PHYs according to the LE Features bits. */ static int hci_le_set_default_phy_sync(struct hci_dev *hdev) { struct hci_cp_le_set_default_phy cp; if (!(hdev->commands[35] & 0x20)) { /* If the command is not supported it means only 1M PHY is * supported. */ hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M; hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M; return 0; } memset(&cp, 0, sizeof(cp)); cp.all_phys = 0x00; cp.tx_phys = HCI_LE_SET_PHY_1M; cp.rx_phys = HCI_LE_SET_PHY_1M; /* Enables 2M PHY if supported */ if (le_2m_capable(hdev)) { cp.tx_phys |= HCI_LE_SET_PHY_2M; cp.rx_phys |= HCI_LE_SET_PHY_2M; } /* Enables Coded PHY if supported */ if (le_coded_capable(hdev)) { cp.tx_phys |= HCI_LE_SET_PHY_CODED; cp.rx_phys |= HCI_LE_SET_PHY_CODED; } return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_DEFAULT_PHY, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static const struct hci_init_stage le_init4[] = { /* HCI_OP_LE_WRITE_DEF_DATA_LEN */ HCI_INIT(hci_le_set_write_def_data_len_sync), /* HCI_OP_LE_SET_DEFAULT_PHY */ HCI_INIT(hci_le_set_default_phy_sync), {} }; static int hci_init4_sync(struct hci_dev *hdev) { int err; bt_dev_dbg(hdev, ""); err = hci_init_stage_sync(hdev, hci_init4); if (err) return err; if (lmp_le_capable(hdev)) return hci_init_stage_sync(hdev, le_init4); return 0; } static int hci_init_sync(struct hci_dev *hdev) { int err; err = hci_init1_sync(hdev); if (err < 0) return err; if (hci_dev_test_flag(hdev, HCI_SETUP)) hci_debugfs_create_basic(hdev); err = hci_init2_sync(hdev); if (err < 0) return err; err = hci_init3_sync(hdev); if (err < 0) return err; err = hci_init4_sync(hdev); if (err < 0) return err; /* This function is only called when the controller is actually in * configured state. When the controller is marked as unconfigured, * this initialization procedure is not run. * * It means that it is possible that a controller runs through its * setup phase and then discovers missing settings. If that is the * case, then this function will not be called. It then will only * be called during the config phase. * * So only when in setup phase or config phase, create the debugfs * entries and register the SMP channels. */ if (!hci_dev_test_flag(hdev, HCI_SETUP) && !hci_dev_test_flag(hdev, HCI_CONFIG)) return 0; if (hci_dev_test_and_set_flag(hdev, HCI_DEBUGFS_CREATED)) return 0; hci_debugfs_create_common(hdev); if (lmp_bredr_capable(hdev)) hci_debugfs_create_bredr(hdev); if (lmp_le_capable(hdev)) hci_debugfs_create_le(hdev); return 0; } #define HCI_QUIRK_BROKEN(_quirk, _desc) { HCI_QUIRK_BROKEN_##_quirk, _desc } static const struct { unsigned long quirk; const char *desc; } hci_broken_table[] = { HCI_QUIRK_BROKEN(LOCAL_COMMANDS, "HCI Read Local Supported Commands not supported"), HCI_QUIRK_BROKEN(STORED_LINK_KEY, "HCI Delete Stored Link Key command is advertised, " "but not supported."), HCI_QUIRK_BROKEN(ERR_DATA_REPORTING, "HCI Read Default Erroneous Data Reporting command is " "advertised, but not supported."), HCI_QUIRK_BROKEN(READ_TRANSMIT_POWER, "HCI Read Transmit Power Level command is advertised, " "but not supported."), HCI_QUIRK_BROKEN(FILTER_CLEAR_ALL, "HCI Set Event Filter command not supported."), HCI_QUIRK_BROKEN(ENHANCED_SETUP_SYNC_CONN, "HCI Enhanced Setup Synchronous Connection command is " "advertised, but not supported."), HCI_QUIRK_BROKEN(SET_RPA_TIMEOUT, "HCI LE Set Random Private Address Timeout command is " "advertised, but not supported."), HCI_QUIRK_BROKEN(EXT_CREATE_CONN, "HCI LE Extended Create Connection command is " "advertised, but not supported."), HCI_QUIRK_BROKEN(WRITE_AUTH_PAYLOAD_TIMEOUT, "HCI WRITE AUTH PAYLOAD TIMEOUT command leads " "to unexpected SMP errors when pairing " "and will not be used."), HCI_QUIRK_BROKEN(LE_CODED, "HCI LE Coded PHY feature bit is set, " "but its usage is not supported.") }; /* This function handles hdev setup stage: * * Calls hdev->setup * Setup address if HCI_QUIRK_USE_BDADDR_PROPERTY is set. */ static int hci_dev_setup_sync(struct hci_dev *hdev) { int ret = 0; bool invalid_bdaddr; size_t i; if (!hci_dev_test_flag(hdev, HCI_SETUP) && !hci_test_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_SETUP)) return 0; bt_dev_dbg(hdev, ""); hci_sock_dev_event(hdev, HCI_DEV_SETUP); if (hdev->setup) ret = hdev->setup(hdev); for (i = 0; i < ARRAY_SIZE(hci_broken_table); i++) { if (hci_test_quirk(hdev, hci_broken_table[i].quirk)) bt_dev_warn(hdev, "%s", hci_broken_table[i].desc); } /* The transport driver can set the quirk to mark the * BD_ADDR invalid before creating the HCI device or in * its setup callback. */ invalid_bdaddr = hci_test_quirk(hdev, HCI_QUIRK_INVALID_BDADDR) || hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY); if (!ret) { if (hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY) && !bacmp(&hdev->public_addr, BDADDR_ANY)) hci_dev_get_bd_addr_from_property(hdev); if (invalid_bdaddr && bacmp(&hdev->public_addr, BDADDR_ANY) && hdev->set_bdaddr) { ret = hdev->set_bdaddr(hdev, &hdev->public_addr); if (!ret) invalid_bdaddr = false; } } /* The transport driver can set these quirks before * creating the HCI device or in its setup callback. * * For the invalid BD_ADDR quirk it is possible that * it becomes a valid address if the bootloader does * provide it (see above). * * In case any of them is set, the controller has to * start up as unconfigured. */ if (hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG) || invalid_bdaddr) hci_dev_set_flag(hdev, HCI_UNCONFIGURED); /* For an unconfigured controller it is required to * read at least the version information provided by * the Read Local Version Information command. * * If the set_bdaddr driver callback is provided, then * also the original Bluetooth public device address * will be read using the Read BD Address command. */ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) return hci_unconf_init_sync(hdev); return ret; } /* This function handles hdev init stage: * * Calls hci_dev_setup_sync to perform setup stage * Calls hci_init_sync to perform HCI command init sequence */ static int hci_dev_init_sync(struct hci_dev *hdev) { int ret; bt_dev_dbg(hdev, ""); atomic_set(&hdev->cmd_cnt, 1); set_bit(HCI_INIT, &hdev->flags); ret = hci_dev_setup_sync(hdev); if (hci_dev_test_flag(hdev, HCI_CONFIG)) { /* If public address change is configured, ensure that * the address gets programmed. If the driver does not * support changing the public address, fail the power * on procedure. */ if (bacmp(&hdev->public_addr, BDADDR_ANY) && hdev->set_bdaddr) ret = hdev->set_bdaddr(hdev, &hdev->public_addr); else ret = -EADDRNOTAVAIL; } if (!ret) { if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { ret = hci_init_sync(hdev); if (!ret && hdev->post_init) ret = hdev->post_init(hdev); } } /* If the HCI Reset command is clearing all diagnostic settings, * then they need to be reprogrammed after the init procedure * completed. */ if (hci_test_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_DIAG) && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) && hdev->set_diag) ret = hdev->set_diag(hdev, true); if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { msft_do_open(hdev); aosp_do_open(hdev); } clear_bit(HCI_INIT, &hdev->flags); return ret; } int hci_dev_open_sync(struct hci_dev *hdev) { int ret; bt_dev_dbg(hdev, ""); if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) { ret = -ENODEV; goto done; } if (!hci_dev_test_flag(hdev, HCI_SETUP) && !hci_dev_test_flag(hdev, HCI_CONFIG)) { /* Check for rfkill but allow the HCI setup stage to * proceed (which in itself doesn't cause any RF activity). */ if (hci_dev_test_flag(hdev, HCI_RFKILLED)) { ret = -ERFKILL; goto done; } /* Check for valid public address or a configured static * random address, but let the HCI setup proceed to * be able to determine if there is a public address * or not. * * In case of user channel usage, it is not important * if a public address or static random address is * available. */ if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && !bacmp(&hdev->bdaddr, BDADDR_ANY) && !bacmp(&hdev->static_addr, BDADDR_ANY)) { ret = -EADDRNOTAVAIL; goto done; } } if (test_bit(HCI_UP, &hdev->flags)) { ret = -EALREADY; goto done; } if (hdev->open(hdev)) { ret = -EIO; goto done; } hci_devcd_reset(hdev); set_bit(HCI_RUNNING, &hdev->flags); hci_sock_dev_event(hdev, HCI_DEV_OPEN); ret = hci_dev_init_sync(hdev); if (!ret) { hci_dev_hold(hdev); hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); hci_adv_instances_set_rpa_expired(hdev, true); set_bit(HCI_UP, &hdev->flags); hci_sock_dev_event(hdev, HCI_DEV_UP); hci_leds_update_powered(hdev, true); if (!hci_dev_test_flag(hdev, HCI_SETUP) && !hci_dev_test_flag(hdev, HCI_CONFIG) && !hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && hci_dev_test_flag(hdev, HCI_MGMT)) { ret = hci_powered_update_sync(hdev); mgmt_power_on(hdev, ret); } } else { /* Init failed, cleanup */ flush_work(&hdev->tx_work); /* Since hci_rx_work() is possible to awake new cmd_work * it should be flushed first to avoid unexpected call of * hci_cmd_work() */ flush_work(&hdev->rx_work); flush_work(&hdev->cmd_work); skb_queue_purge(&hdev->cmd_q); skb_queue_purge(&hdev->rx_q); if (hdev->flush) hdev->flush(hdev); if (hdev->sent_cmd) { cancel_delayed_work_sync(&hdev->cmd_timer); kfree_skb(hdev->sent_cmd); hdev->sent_cmd = NULL; } if (hdev->req_skb) { kfree_skb(hdev->req_skb); hdev->req_skb = NULL; } clear_bit(HCI_RUNNING, &hdev->flags); hci_sock_dev_event(hdev, HCI_DEV_CLOSE); hdev->close(hdev); hdev->flags &= BIT(HCI_RAW); } done: return ret; } /* This function requires the caller holds hdev->lock */ static void hci_pend_le_actions_clear(struct hci_dev *hdev) { struct hci_conn_params *p; list_for_each_entry(p, &hdev->le_conn_params, list) { hci_pend_le_list_del_init(p); if (p->conn) { hci_conn_drop(p->conn); hci_conn_put(p->conn); p->conn = NULL; } } BT_DBG("All LE pending actions cleared"); } static int hci_dev_shutdown(struct hci_dev *hdev) { int err = 0; /* Similar to how we first do setup and then set the exclusive access * bit for userspace, we must first unset userchannel and then clean up. * Otherwise, the kernel can't properly use the hci channel to clean up * the controller (some shutdown routines require sending additional * commands to the controller for example). */ bool was_userchannel = hci_dev_test_and_clear_flag(hdev, HCI_USER_CHANNEL); if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) && test_bit(HCI_UP, &hdev->flags)) { /* Execute vendor specific shutdown routine */ if (hdev->shutdown) err = hdev->shutdown(hdev); } if (was_userchannel) hci_dev_set_flag(hdev, HCI_USER_CHANNEL); return err; } int hci_dev_close_sync(struct hci_dev *hdev) { bool auto_off; int err = 0; bt_dev_dbg(hdev, ""); if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) { disable_delayed_work(&hdev->power_off); disable_delayed_work(&hdev->ncmd_timer); disable_delayed_work(&hdev->le_scan_disable); } else { cancel_delayed_work(&hdev->power_off); cancel_delayed_work(&hdev->ncmd_timer); cancel_delayed_work(&hdev->le_scan_disable); } hci_cmd_sync_cancel_sync(hdev, ENODEV); cancel_interleave_scan(hdev); if (hdev->adv_instance_timeout) { cancel_delayed_work_sync(&hdev->adv_instance_expire); hdev->adv_instance_timeout = 0; } err = hci_dev_shutdown(hdev); if (!test_and_clear_bit(HCI_UP, &hdev->flags)) { cancel_delayed_work_sync(&hdev->cmd_timer); return err; } hci_leds_update_powered(hdev, false); /* Flush RX and TX works */ flush_work(&hdev->tx_work); flush_work(&hdev->rx_work); if (hdev->discov_timeout > 0) { hdev->discov_timeout = 0; hci_dev_clear_flag(hdev, HCI_DISCOVERABLE); hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE); } if (hci_dev_test_and_clear_flag(hdev, HCI_SERVICE_CACHE)) cancel_delayed_work(&hdev->service_cache); if (hci_dev_test_flag(hdev, HCI_MGMT)) { struct adv_info *adv_instance; cancel_delayed_work_sync(&hdev->rpa_expired); list_for_each_entry(adv_instance, &hdev->adv_instances, list) cancel_delayed_work_sync(&adv_instance->rpa_expired_cb); } /* Avoid potential lockdep warnings from the *_flush() calls by * ensuring the workqueue is empty up front. */ drain_workqueue(hdev->workqueue); hci_dev_lock(hdev); hci_discovery_set_state(hdev, DISCOVERY_STOPPED); auto_off = hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF); if (!auto_off && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && hci_dev_test_flag(hdev, HCI_MGMT)) __mgmt_power_off(hdev); hci_inquiry_cache_flush(hdev); hci_pend_le_actions_clear(hdev); hci_conn_hash_flush(hdev); /* Prevent data races on hdev->smp_data or hdev->smp_bredr_data */ smp_unregister(hdev); hci_dev_unlock(hdev); hci_sock_dev_event(hdev, HCI_DEV_DOWN); if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { aosp_do_close(hdev); msft_do_close(hdev); } if (hdev->flush) hdev->flush(hdev); /* Reset device */ skb_queue_purge(&hdev->cmd_q); atomic_set(&hdev->cmd_cnt, 1); if (hci_test_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE) && !auto_off && !hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { set_bit(HCI_INIT, &hdev->flags); hci_reset_sync(hdev); clear_bit(HCI_INIT, &hdev->flags); } /* flush cmd work */ flush_work(&hdev->cmd_work); /* Drop queues */ skb_queue_purge(&hdev->rx_q); skb_queue_purge(&hdev->cmd_q); skb_queue_purge(&hdev->raw_q); /* Drop last sent command */ if (hdev->sent_cmd) { cancel_delayed_work_sync(&hdev->cmd_timer); kfree_skb(hdev->sent_cmd); hdev->sent_cmd = NULL; } /* Drop last request */ if (hdev->req_skb) { kfree_skb(hdev->req_skb); hdev->req_skb = NULL; } clear_bit(HCI_RUNNING, &hdev->flags); hci_sock_dev_event(hdev, HCI_DEV_CLOSE); /* After this point our queues are empty and no tasks are scheduled. */ hdev->close(hdev); /* Clear flags */ hdev->flags &= BIT(HCI_RAW); hci_dev_clear_volatile_flags(hdev); memset(hdev->eir, 0, sizeof(hdev->eir)); memset(hdev->dev_class, 0, sizeof(hdev->dev_class)); bacpy(&hdev->random_addr, BDADDR_ANY); hci_codec_list_clear(&hdev->local_codecs); hci_dev_put(hdev); return err; } /* This function perform power on HCI command sequence as follows: * * If controller is already up (HCI_UP) performs hci_powered_update_sync * sequence otherwise run hci_dev_open_sync which will follow with * hci_powered_update_sync after the init sequence is completed. */ static int hci_power_on_sync(struct hci_dev *hdev) { int err; if (test_bit(HCI_UP, &hdev->flags) && hci_dev_test_flag(hdev, HCI_MGMT) && hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) { cancel_delayed_work(&hdev->power_off); return hci_powered_update_sync(hdev); } err = hci_dev_open_sync(hdev); if (err < 0) return err; /* During the HCI setup phase, a few error conditions are * ignored and they need to be checked now. If they are still * valid, it is important to return the device back off. */ if (hci_dev_test_flag(hdev, HCI_RFKILLED) || hci_dev_test_flag(hdev, HCI_UNCONFIGURED) || (!bacmp(&hdev->bdaddr, BDADDR_ANY) && !bacmp(&hdev->static_addr, BDADDR_ANY))) { hci_dev_clear_flag(hdev, HCI_AUTO_OFF); hci_dev_close_sync(hdev); } else if (hci_dev_test_flag(hdev, HCI_AUTO_OFF)) { queue_delayed_work(hdev->req_workqueue, &hdev->power_off, HCI_AUTO_OFF_TIMEOUT); } if (hci_dev_test_and_clear_flag(hdev, HCI_SETUP)) { /* For unconfigured devices, set the HCI_RAW flag * so that userspace can easily identify them. */ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) set_bit(HCI_RAW, &hdev->flags); /* For fully configured devices, this will send * the Index Added event. For unconfigured devices, * it will send Unconfigued Index Added event. * * Devices with HCI_QUIRK_RAW_DEVICE are ignored * and no event will be send. */ mgmt_index_added(hdev); } else if (hci_dev_test_and_clear_flag(hdev, HCI_CONFIG)) { /* When the controller is now configured, then it * is important to clear the HCI_RAW flag. */ if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) clear_bit(HCI_RAW, &hdev->flags); /* Powering on the controller with HCI_CONFIG set only * happens with the transition from unconfigured to * configured. This will send the Index Added event. */ mgmt_index_added(hdev); } return 0; } static int hci_remote_name_cancel_sync(struct hci_dev *hdev, bdaddr_t *addr) { struct hci_cp_remote_name_req_cancel cp; memset(&cp, 0, sizeof(cp)); bacpy(&cp.bdaddr, addr); return __hci_cmd_sync_status(hdev, HCI_OP_REMOTE_NAME_REQ_CANCEL, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_stop_discovery_sync(struct hci_dev *hdev) { struct discovery_state *d = &hdev->discovery; struct inquiry_entry *e; int err; bt_dev_dbg(hdev, "state %u", hdev->discovery.state); if (d->state == DISCOVERY_FINDING || d->state == DISCOVERY_STOPPING) { if (test_bit(HCI_INQUIRY, &hdev->flags)) { err = __hci_cmd_sync_status(hdev, HCI_OP_INQUIRY_CANCEL, 0, NULL, HCI_CMD_TIMEOUT); if (err) return err; } if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { cancel_delayed_work(&hdev->le_scan_disable); err = hci_scan_disable_sync(hdev); if (err) return err; } } else { err = hci_scan_disable_sync(hdev); if (err) return err; } /* Resume advertising if it was paused */ if (ll_privacy_capable(hdev)) hci_resume_advertising_sync(hdev); /* No further actions needed for LE-only discovery */ if (d->type == DISCOV_TYPE_LE) return 0; if (d->state == DISCOVERY_RESOLVING || d->state == DISCOVERY_STOPPING) { e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY, NAME_PENDING); if (!e) return 0; /* Ignore cancel errors since it should interfere with stopping * of the discovery. */ hci_remote_name_cancel_sync(hdev, &e->data.bdaddr); } return 0; } static int hci_disconnect_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { struct hci_cp_disconnect cp; if (conn->type == BIS_LINK || conn->type == PA_LINK) { /* This is a BIS connection, hci_conn_del will * do the necessary cleanup. */ hci_dev_lock(hdev); hci_conn_failed(conn, reason); hci_dev_unlock(hdev); return 0; } memset(&cp, 0, sizeof(cp)); cp.handle = cpu_to_le16(conn->handle); cp.reason = reason; /* Wait for HCI_EV_DISCONN_COMPLETE, not HCI_EV_CMD_STATUS, when the * reason is anything but HCI_ERROR_REMOTE_POWER_OFF. This reason is * used when suspending or powering off, where we don't want to wait * for the peer's response. */ if (reason != HCI_ERROR_REMOTE_POWER_OFF) return __hci_cmd_sync_status_sk(hdev, HCI_OP_DISCONNECT, sizeof(cp), &cp, HCI_EV_DISCONN_COMPLETE, HCI_CMD_TIMEOUT, NULL); return __hci_cmd_sync_status(hdev, HCI_OP_DISCONNECT, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_le_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { /* Return reason if scanning since the connection shall probably be * cleanup directly. */ if (test_bit(HCI_CONN_SCANNING, &conn->flags)) return reason; if (conn->role == HCI_ROLE_SLAVE || test_and_set_bit(HCI_CONN_CANCEL, &conn->flags)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_CREATE_CONN_CANCEL, 0, NULL, HCI_CMD_TIMEOUT); } static int hci_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { if (conn->type == LE_LINK) return hci_le_connect_cancel_sync(hdev, conn, reason); if (conn->type == CIS_LINK) { /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E * page 1857: * * If this command is issued for a CIS on the Central and the * CIS is successfully terminated before being established, * then an HCI_LE_CIS_Established event shall also be sent for * this CIS with the Status Operation Cancelled by Host (0x44). */ if (test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) return hci_disconnect_sync(hdev, conn, reason); /* CIS with no Create CIS sent have nothing to cancel */ return HCI_ERROR_LOCAL_HOST_TERM; } if (conn->type == BIS_LINK || conn->type == PA_LINK) { /* There is no way to cancel a BIS without terminating the BIG * which is done later on connection cleanup. */ return 0; } if (hdev->hci_ver < BLUETOOTH_VER_1_2) return 0; /* Wait for HCI_EV_CONN_COMPLETE, not HCI_EV_CMD_STATUS, when the * reason is anything but HCI_ERROR_REMOTE_POWER_OFF. This reason is * used when suspending or powering off, where we don't want to wait * for the peer's response. */ if (reason != HCI_ERROR_REMOTE_POWER_OFF) return __hci_cmd_sync_status_sk(hdev, HCI_OP_CREATE_CONN_CANCEL, 6, &conn->dst, HCI_EV_CONN_COMPLETE, HCI_CMD_TIMEOUT, NULL); return __hci_cmd_sync_status(hdev, HCI_OP_CREATE_CONN_CANCEL, 6, &conn->dst, HCI_CMD_TIMEOUT); } static int hci_reject_sco_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { struct hci_cp_reject_sync_conn_req cp; memset(&cp, 0, sizeof(cp)); bacpy(&cp.bdaddr, &conn->dst); cp.reason = reason; /* SCO rejection has its own limited set of * allowed error values (0x0D-0x0F). */ if (reason < 0x0d || reason > 0x0f) cp.reason = HCI_ERROR_REJ_LIMITED_RESOURCES; return __hci_cmd_sync_status(hdev, HCI_OP_REJECT_SYNC_CONN_REQ, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_le_reject_cis_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { struct hci_cp_le_reject_cis cp; memset(&cp, 0, sizeof(cp)); cp.handle = cpu_to_le16(conn->handle); cp.reason = reason; return __hci_cmd_sync_status(hdev, HCI_OP_LE_REJECT_CIS, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_reject_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { struct hci_cp_reject_conn_req cp; if (conn->type == CIS_LINK) return hci_le_reject_cis_sync(hdev, conn, reason); if (conn->type == BIS_LINK || conn->type == PA_LINK) return -EINVAL; if (conn->type == SCO_LINK || conn->type == ESCO_LINK) return hci_reject_sco_sync(hdev, conn, reason); memset(&cp, 0, sizeof(cp)); bacpy(&cp.bdaddr, &conn->dst); cp.reason = reason; return __hci_cmd_sync_status(hdev, HCI_OP_REJECT_CONN_REQ, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { int err = 0; u16 handle = conn->handle; bool disconnect = false; struct hci_conn *c; switch (conn->state) { case BT_CONNECTED: case BT_CONFIG: err = hci_disconnect_sync(hdev, conn, reason); break; case BT_CONNECT: err = hci_connect_cancel_sync(hdev, conn, reason); break; case BT_CONNECT2: err = hci_reject_conn_sync(hdev, conn, reason); break; case BT_OPEN: case BT_BOUND: break; default: disconnect = true; break; } hci_dev_lock(hdev); /* Check if the connection has been cleaned up concurrently */ c = hci_conn_hash_lookup_handle(hdev, handle); if (!c || c != conn) { err = 0; goto unlock; } /* Cleanup hci_conn object if it cannot be cancelled as it * likely means the controller and host stack are out of sync * or in case of LE it was still scanning so it can be cleanup * safely. */ if (disconnect) { conn->state = BT_CLOSED; hci_disconn_cfm(conn, reason); hci_conn_del(conn); } else { hci_conn_failed(conn, reason); } unlock: hci_dev_unlock(hdev); return err; } static int hci_disconnect_all_sync(struct hci_dev *hdev, u8 reason) { struct list_head *head = &hdev->conn_hash.list; struct hci_conn *conn; rcu_read_lock(); while ((conn = list_first_or_null_rcu(head, struct hci_conn, list))) { /* Make sure the connection is not freed while unlocking */ conn = hci_conn_get(conn); rcu_read_unlock(); /* Disregard possible errors since hci_conn_del shall have been * called even in case of errors had occurred since it would * then cause hci_conn_failed to be called which calls * hci_conn_del internally. */ hci_abort_conn_sync(hdev, conn, reason); hci_conn_put(conn); rcu_read_lock(); } rcu_read_unlock(); return 0; } /* This function perform power off HCI command sequence as follows: * * Clear Advertising * Stop Discovery * Disconnect all connections * hci_dev_close_sync */ static int hci_power_off_sync(struct hci_dev *hdev) { int err; /* If controller is already down there is nothing to do */ if (!test_bit(HCI_UP, &hdev->flags)) return 0; hci_dev_set_flag(hdev, HCI_POWERING_DOWN); if (test_bit(HCI_ISCAN, &hdev->flags) || test_bit(HCI_PSCAN, &hdev->flags)) { err = hci_write_scan_enable_sync(hdev, 0x00); if (err) goto out; } err = hci_clear_adv_sync(hdev, NULL, false); if (err) goto out; err = hci_stop_discovery_sync(hdev); if (err) goto out; /* Terminated due to Power Off */ err = hci_disconnect_all_sync(hdev, HCI_ERROR_REMOTE_POWER_OFF); if (err) goto out; err = hci_dev_close_sync(hdev); out: hci_dev_clear_flag(hdev, HCI_POWERING_DOWN); return err; } int hci_set_powered_sync(struct hci_dev *hdev, u8 val) { if (val) return hci_power_on_sync(hdev); return hci_power_off_sync(hdev); } static int hci_write_iac_sync(struct hci_dev *hdev) { struct hci_cp_write_current_iac_lap cp; if (!hci_dev_test_flag(hdev, HCI_DISCOVERABLE)) return 0; memset(&cp, 0, sizeof(cp)); if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) { /* Limited discoverable mode */ cp.num_iac = min_t(u8, hdev->num_iac, 2); cp.iac_lap[0] = 0x00; /* LIAC */ cp.iac_lap[1] = 0x8b; cp.iac_lap[2] = 0x9e; cp.iac_lap[3] = 0x33; /* GIAC */ cp.iac_lap[4] = 0x8b; cp.iac_lap[5] = 0x9e; } else { /* General discoverable mode */ cp.num_iac = 1; cp.iac_lap[0] = 0x33; /* GIAC */ cp.iac_lap[1] = 0x8b; cp.iac_lap[2] = 0x9e; } return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_CURRENT_IAC_LAP, (cp.num_iac * 3) + 1, &cp, HCI_CMD_TIMEOUT); } int hci_update_discoverable_sync(struct hci_dev *hdev) { int err = 0; if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { err = hci_write_iac_sync(hdev); if (err) return err; err = hci_update_scan_sync(hdev); if (err) return err; err = hci_update_class_sync(hdev); if (err) return err; } /* Advertising instances don't use the global discoverable setting, so * only update AD if advertising was enabled using Set Advertising. */ if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) { err = hci_update_adv_data_sync(hdev, 0x00); if (err) return err; /* Discoverable mode affects the local advertising * address in limited privacy mode. */ if (hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY)) { if (ext_adv_capable(hdev)) err = hci_start_ext_adv_sync(hdev, 0x00); else err = hci_enable_advertising_sync(hdev); } } return err; } static int update_discoverable_sync(struct hci_dev *hdev, void *data) { return hci_update_discoverable_sync(hdev); } int hci_update_discoverable(struct hci_dev *hdev) { /* Only queue if it would have any effect */ if (hdev_is_powered(hdev) && hci_dev_test_flag(hdev, HCI_ADVERTISING) && hci_dev_test_flag(hdev, HCI_DISCOVERABLE) && hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY)) return hci_cmd_sync_queue(hdev, update_discoverable_sync, NULL, NULL); return 0; } int hci_update_connectable_sync(struct hci_dev *hdev) { int err; err = hci_update_scan_sync(hdev); if (err) return err; /* If BR/EDR is not enabled and we disable advertising as a * by-product of disabling connectable, we need to update the * advertising flags. */ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) err = hci_update_adv_data_sync(hdev, hdev->cur_adv_instance); /* Update the advertising parameters if necessary */ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || !list_empty(&hdev->adv_instances)) { if (ext_adv_capable(hdev)) err = hci_start_ext_adv_sync(hdev, hdev->cur_adv_instance); else err = hci_enable_advertising_sync(hdev); if (err) return err; } return hci_update_passive_scan_sync(hdev); } int hci_inquiry_sync(struct hci_dev *hdev, u8 length, u8 num_rsp) { const u8 giac[3] = { 0x33, 0x8b, 0x9e }; const u8 liac[3] = { 0x00, 0x8b, 0x9e }; struct hci_cp_inquiry cp; bt_dev_dbg(hdev, ""); if (test_bit(HCI_INQUIRY, &hdev->flags)) return 0; hci_dev_lock(hdev); hci_inquiry_cache_flush(hdev); hci_dev_unlock(hdev); memset(&cp, 0, sizeof(cp)); if (hdev->discovery.limited) memcpy(&cp.lap, liac, sizeof(cp.lap)); else memcpy(&cp.lap, giac, sizeof(cp.lap)); cp.length = length; cp.num_rsp = num_rsp; return __hci_cmd_sync_status(hdev, HCI_OP_INQUIRY, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_active_scan_sync(struct hci_dev *hdev, uint16_t interval) { u8 own_addr_type; /* Accept list is not used for discovery */ u8 filter_policy = 0x00; /* Default is to enable duplicates filter */ u8 filter_dup = LE_SCAN_FILTER_DUP_ENABLE; int err; bt_dev_dbg(hdev, ""); /* If controller is scanning, it means the passive scanning is * running. Thus, we should temporarily stop it in order to set the * discovery scanning parameters. */ err = hci_scan_disable_sync(hdev); if (err) { bt_dev_err(hdev, "Unable to disable scanning: %d", err); return err; } cancel_interleave_scan(hdev); /* Pause address resolution for active scan and stop advertising if * privacy is enabled. */ err = hci_pause_addr_resolution(hdev); if (err) goto failed; /* All active scans will be done with either a resolvable private * address (when privacy feature has been enabled) or non-resolvable * private address. */ err = hci_update_random_address_sync(hdev, true, scan_use_rpa(hdev), &own_addr_type); if (err < 0) own_addr_type = ADDR_LE_DEV_PUBLIC; if (hci_is_adv_monitoring(hdev) || (hci_test_quirk(hdev, HCI_QUIRK_STRICT_DUPLICATE_FILTER) && hdev->discovery.result_filtering)) { /* Duplicate filter should be disabled when some advertisement * monitor is activated, otherwise AdvMon can only receive one * advertisement for one peer(*) during active scanning, and * might report loss to these peers. * * If controller does strict duplicate filtering and the * discovery requires result filtering disables controller based * filtering since that can cause reports that would match the * host filter to not be reported. */ filter_dup = LE_SCAN_FILTER_DUP_DISABLE; } err = hci_start_scan_sync(hdev, LE_SCAN_ACTIVE, interval, hdev->le_scan_window_discovery, own_addr_type, filter_policy, filter_dup); if (!err) return err; failed: /* Resume advertising if it was paused */ if (ll_privacy_capable(hdev)) hci_resume_advertising_sync(hdev); /* Resume passive scanning */ hci_update_passive_scan_sync(hdev); return err; } static int hci_start_interleaved_discovery_sync(struct hci_dev *hdev) { int err; bt_dev_dbg(hdev, ""); err = hci_active_scan_sync(hdev, hdev->le_scan_int_discovery * 2); if (err) return err; return hci_inquiry_sync(hdev, DISCOV_BREDR_INQUIRY_LEN, 0); } int hci_start_discovery_sync(struct hci_dev *hdev) { unsigned long timeout; int err; bt_dev_dbg(hdev, "type %u", hdev->discovery.type); switch (hdev->discovery.type) { case DISCOV_TYPE_BREDR: return hci_inquiry_sync(hdev, DISCOV_BREDR_INQUIRY_LEN, 0); case DISCOV_TYPE_INTERLEAVED: /* When running simultaneous discovery, the LE scanning time * should occupy the whole discovery time sine BR/EDR inquiry * and LE scanning are scheduled by the controller. * * For interleaving discovery in comparison, BR/EDR inquiry * and LE scanning are done sequentially with separate * timeouts. */ if (hci_test_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY)) { timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT); /* During simultaneous discovery, we double LE scan * interval. We must leave some time for the controller * to do BR/EDR inquiry. */ err = hci_start_interleaved_discovery_sync(hdev); break; } timeout = msecs_to_jiffies(hdev->discov_interleaved_timeout); err = hci_active_scan_sync(hdev, hdev->le_scan_int_discovery); break; case DISCOV_TYPE_LE: timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT); err = hci_active_scan_sync(hdev, hdev->le_scan_int_discovery); break; default: return -EINVAL; } if (err) return err; bt_dev_dbg(hdev, "timeout %u ms", jiffies_to_msecs(timeout)); queue_delayed_work(hdev->req_workqueue, &hdev->le_scan_disable, timeout); return 0; } static void hci_suspend_monitor_sync(struct hci_dev *hdev) { switch (hci_get_adv_monitor_offload_ext(hdev)) { case HCI_ADV_MONITOR_EXT_MSFT: msft_suspend_sync(hdev); break; default: return; } } /* This function disables discovery and mark it as paused */ static int hci_pause_discovery_sync(struct hci_dev *hdev) { int old_state = hdev->discovery.state; int err; /* If discovery already stopped/stopping/paused there nothing to do */ if (old_state == DISCOVERY_STOPPED || old_state == DISCOVERY_STOPPING || hdev->discovery_paused) return 0; hci_discovery_set_state(hdev, DISCOVERY_STOPPING); err = hci_stop_discovery_sync(hdev); if (err) return err; hdev->discovery_paused = true; hci_discovery_set_state(hdev, DISCOVERY_STOPPED); return 0; } static int hci_update_event_filter_sync(struct hci_dev *hdev) { struct bdaddr_list_with_flags *b; u8 scan = SCAN_DISABLED; bool scanning = test_bit(HCI_PSCAN, &hdev->flags); int err; if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) return 0; /* Some fake CSR controllers lock up after setting this type of * filter, so avoid sending the request altogether. */ if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL)) return 0; /* Always clear event filter when starting */ hci_clear_event_filter_sync(hdev); list_for_each_entry(b, &hdev->accept_list, list) { if (!(b->flags & HCI_CONN_FLAG_REMOTE_WAKEUP)) continue; bt_dev_dbg(hdev, "Adding event filters for %pMR", &b->bdaddr); err = hci_set_event_filter_sync(hdev, HCI_FLT_CONN_SETUP, HCI_CONN_SETUP_ALLOW_BDADDR, &b->bdaddr, HCI_CONN_SETUP_AUTO_ON); if (err) bt_dev_err(hdev, "Failed to set event filter for %pMR", &b->bdaddr); else scan = SCAN_PAGE; } if (scan && !scanning) hci_write_scan_enable_sync(hdev, scan); else if (!scan && scanning) hci_write_scan_enable_sync(hdev, scan); return 0; } /* This function disables scan (BR and LE) and mark it as paused */ static int hci_pause_scan_sync(struct hci_dev *hdev) { if (hdev->scanning_paused) return 0; /* Disable page scan if enabled */ if (test_bit(HCI_PSCAN, &hdev->flags)) hci_write_scan_enable_sync(hdev, SCAN_DISABLED); hci_scan_disable_sync(hdev); hdev->scanning_paused = true; return 0; } /* This function performs the HCI suspend procedures in the follow order: * * Pause discovery (active scanning/inquiry) * Pause Directed Advertising/Advertising * Pause Scanning (passive scanning in case discovery was not active) * Disconnect all connections * Set suspend_status to BT_SUSPEND_DISCONNECT if hdev cannot wakeup * otherwise: * Update event mask (only set events that are allowed to wake up the host) * Update event filter (with devices marked with HCI_CONN_FLAG_REMOTE_WAKEUP) * Update passive scanning (lower duty cycle) * Set suspend_status to BT_SUSPEND_CONFIGURE_WAKE */ int hci_suspend_sync(struct hci_dev *hdev) { int err; /* If marked as suspended there nothing to do */ if (hdev->suspended) return 0; /* Mark device as suspended */ hdev->suspended = true; /* Pause discovery if not already stopped */ hci_pause_discovery_sync(hdev); /* Pause other advertisements */ hci_pause_advertising_sync(hdev); /* Suspend monitor filters */ hci_suspend_monitor_sync(hdev); /* Prevent disconnects from causing scanning to be re-enabled */ hci_pause_scan_sync(hdev); if (hci_conn_count(hdev)) { /* Soft disconnect everything (power off) */ err = hci_disconnect_all_sync(hdev, HCI_ERROR_REMOTE_POWER_OFF); if (err) { /* Set state to BT_RUNNING so resume doesn't notify */ hdev->suspend_state = BT_RUNNING; hci_resume_sync(hdev); return err; } /* Update event mask so only the allowed event can wakeup the * host. */ hci_set_event_mask_sync(hdev); } /* Only configure accept list if disconnect succeeded and wake * isn't being prevented. */ if (!hdev->wakeup || !hdev->wakeup(hdev)) { hdev->suspend_state = BT_SUSPEND_DISCONNECT; return 0; } /* Unpause to take care of updating scanning params */ hdev->scanning_paused = false; /* Enable event filter for paired devices */ hci_update_event_filter_sync(hdev); /* Update LE passive scan if enabled */ hci_update_passive_scan_sync(hdev); /* Pause scan changes again. */ hdev->scanning_paused = true; hdev->suspend_state = BT_SUSPEND_CONFIGURE_WAKE; return 0; } /* This function resumes discovery */ static int hci_resume_discovery_sync(struct hci_dev *hdev) { int err; /* If discovery not paused there nothing to do */ if (!hdev->discovery_paused) return 0; hdev->discovery_paused = false; hci_discovery_set_state(hdev, DISCOVERY_STARTING); err = hci_start_discovery_sync(hdev); hci_discovery_set_state(hdev, err ? DISCOVERY_STOPPED : DISCOVERY_FINDING); return err; } static void hci_resume_monitor_sync(struct hci_dev *hdev) { switch (hci_get_adv_monitor_offload_ext(hdev)) { case HCI_ADV_MONITOR_EXT_MSFT: msft_resume_sync(hdev); break; default: return; } } /* This function resume scan and reset paused flag */ static int hci_resume_scan_sync(struct hci_dev *hdev) { if (!hdev->scanning_paused) return 0; hdev->scanning_paused = false; hci_update_scan_sync(hdev); /* Reset passive scanning to normal */ hci_update_passive_scan_sync(hdev); return 0; } /* This function performs the HCI suspend procedures in the follow order: * * Restore event mask * Clear event filter * Update passive scanning (normal duty cycle) * Resume Directed Advertising/Advertising * Resume discovery (active scanning/inquiry) */ int hci_resume_sync(struct hci_dev *hdev) { /* If not marked as suspended there nothing to do */ if (!hdev->suspended) return 0; hdev->suspended = false; /* Restore event mask */ hci_set_event_mask_sync(hdev); /* Clear any event filters and restore scan state */ hci_clear_event_filter_sync(hdev); /* Resume scanning */ hci_resume_scan_sync(hdev); /* Resume monitor filters */ hci_resume_monitor_sync(hdev); /* Resume other advertisements */ hci_resume_advertising_sync(hdev); /* Resume discovery */ hci_resume_discovery_sync(hdev); return 0; } static bool conn_use_rpa(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; return hci_dev_test_flag(hdev, HCI_PRIVACY); } static int hci_le_ext_directed_advertising_sync(struct hci_dev *hdev, struct hci_conn *conn) { struct hci_cp_le_set_ext_adv_params cp; struct hci_rp_le_set_ext_adv_params rp; int err; bdaddr_t random_addr; u8 own_addr_type; err = hci_update_random_address_sync(hdev, false, conn_use_rpa(conn), &own_addr_type); if (err) return err; /* Set require_privacy to false so that the remote device has a * chance of identifying us. */ err = hci_get_random_address(hdev, false, conn_use_rpa(conn), NULL, &own_addr_type, &random_addr); if (err) return err; memset(&cp, 0, sizeof(cp)); cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_DIRECT_IND); cp.channel_map = hdev->le_adv_channel_map; cp.tx_power = HCI_TX_POWER_INVALID; cp.primary_phy = HCI_ADV_PHY_1M; cp.secondary_phy = HCI_ADV_PHY_1M; cp.handle = 0x00; /* Use instance 0 for directed adv */ cp.own_addr_type = own_addr_type; cp.peer_addr_type = conn->dst_type; bacpy(&cp.peer_addr, &conn->dst); /* As per Core Spec 5.2 Vol 2, PART E, Sec 7.8.53, for * advertising_event_property LE_LEGACY_ADV_DIRECT_IND * does not supports advertising data when the advertising set already * contains some, the controller shall return erroc code 'Invalid * HCI Command Parameters(0x12). * So it is required to remove adv set for handle 0x00. since we use * instance 0 for directed adv. */ err = hci_remove_ext_adv_instance_sync(hdev, cp.handle, NULL); if (err) return err; err = hci_set_ext_adv_params_sync(hdev, NULL, &cp, &rp); if (err) return err; /* Update adv data as tx power is known now */ err = hci_set_ext_adv_data_sync(hdev, cp.handle); if (err) return err; /* Check if random address need to be updated */ if (own_addr_type == ADDR_LE_DEV_RANDOM && bacmp(&random_addr, BDADDR_ANY) && bacmp(&random_addr, &hdev->random_addr)) { err = hci_set_adv_set_random_addr_sync(hdev, 0x00, &random_addr); if (err) return err; } return hci_enable_ext_advertising_sync(hdev, 0x00); } static int hci_le_directed_advertising_sync(struct hci_dev *hdev, struct hci_conn *conn) { struct hci_cp_le_set_adv_param cp; u8 status; u8 own_addr_type; u8 enable; if (ext_adv_capable(hdev)) return hci_le_ext_directed_advertising_sync(hdev, conn); /* Clear the HCI_LE_ADV bit temporarily so that the * hci_update_random_address knows that it's safe to go ahead * and write a new random address. The flag will be set back on * as soon as the SET_ADV_ENABLE HCI command completes. */ hci_dev_clear_flag(hdev, HCI_LE_ADV); /* Set require_privacy to false so that the remote device has a * chance of identifying us. */ status = hci_update_random_address_sync(hdev, false, conn_use_rpa(conn), &own_addr_type); if (status) return status; memset(&cp, 0, sizeof(cp)); /* Some controllers might reject command if intervals are not * within range for undirected advertising. * BCM20702A0 is known to be affected by this. */ cp.min_interval = cpu_to_le16(0x0020); cp.max_interval = cpu_to_le16(0x0020); cp.type = LE_ADV_DIRECT_IND; cp.own_address_type = own_addr_type; cp.direct_addr_type = conn->dst_type; bacpy(&cp.direct_addr, &conn->dst); cp.channel_map = hdev->le_adv_channel_map; status = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (status) return status; enable = 0x01; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable, HCI_CMD_TIMEOUT); } static void set_ext_conn_params(struct hci_conn *conn, struct hci_cp_le_ext_conn_param *p) { struct hci_dev *hdev = conn->hdev; memset(p, 0, sizeof(*p)); p->scan_interval = cpu_to_le16(hdev->le_scan_int_connect); p->scan_window = cpu_to_le16(hdev->le_scan_window_connect); p->conn_interval_min = cpu_to_le16(conn->le_conn_min_interval); p->conn_interval_max = cpu_to_le16(conn->le_conn_max_interval); p->conn_latency = cpu_to_le16(conn->le_conn_latency); p->supervision_timeout = cpu_to_le16(conn->le_supv_timeout); p->min_ce_len = cpu_to_le16(0x0000); p->max_ce_len = cpu_to_le16(0x0000); } static int hci_le_ext_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 own_addr_type) { struct hci_cp_le_ext_create_conn *cp; struct hci_cp_le_ext_conn_param *p; u8 data[sizeof(*cp) + sizeof(*p) * 3]; u32 plen; cp = (void *)data; p = (void *)cp->data; memset(cp, 0, sizeof(*cp)); bacpy(&cp->peer_addr, &conn->dst); cp->peer_addr_type = conn->dst_type; cp->own_addr_type = own_addr_type; plen = sizeof(*cp); if (scan_1m(hdev) && (conn->le_adv_phy == HCI_ADV_PHY_1M || conn->le_adv_sec_phy == HCI_ADV_PHY_1M)) { cp->phys |= LE_SCAN_PHY_1M; set_ext_conn_params(conn, p); p++; plen += sizeof(*p); } if (scan_2m(hdev) && (conn->le_adv_phy == HCI_ADV_PHY_2M || conn->le_adv_sec_phy == HCI_ADV_PHY_2M)) { cp->phys |= LE_SCAN_PHY_2M; set_ext_conn_params(conn, p); p++; plen += sizeof(*p); } if (scan_coded(hdev) && (conn->le_adv_phy == HCI_ADV_PHY_CODED || conn->le_adv_sec_phy == HCI_ADV_PHY_CODED)) { cp->phys |= LE_SCAN_PHY_CODED; set_ext_conn_params(conn, p); plen += sizeof(*p); } return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_EXT_CREATE_CONN, plen, data, HCI_EV_LE_ENHANCED_CONN_COMPLETE, conn->conn_timeout, NULL); } static int hci_le_create_conn_sync(struct hci_dev *hdev, void *data) { struct hci_cp_le_create_conn cp; struct hci_conn_params *params; u8 own_addr_type; int err; struct hci_conn *conn = data; if (!hci_conn_valid(hdev, conn)) return -ECANCELED; bt_dev_dbg(hdev, "conn %p", conn); clear_bit(HCI_CONN_SCANNING, &conn->flags); conn->state = BT_CONNECT; /* If requested to connect as peripheral use directed advertising */ if (conn->role == HCI_ROLE_SLAVE) { /* If we're active scanning and simultaneous roles is not * enabled simply reject the attempt. */ if (hci_dev_test_flag(hdev, HCI_LE_SCAN) && hdev->le_scan_type == LE_SCAN_ACTIVE && !hci_dev_test_flag(hdev, HCI_LE_SIMULTANEOUS_ROLES)) { hci_conn_del(conn); return -EBUSY; } /* Pause advertising while doing directed advertising. */ hci_pause_advertising_sync(hdev); err = hci_le_directed_advertising_sync(hdev, conn); goto done; } /* Disable advertising if simultaneous roles is not in use. */ if (!hci_dev_test_flag(hdev, HCI_LE_SIMULTANEOUS_ROLES)) hci_pause_advertising_sync(hdev); params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type); if (params) { conn->le_conn_min_interval = params->conn_min_interval; conn->le_conn_max_interval = params->conn_max_interval; conn->le_conn_latency = params->conn_latency; conn->le_supv_timeout = params->supervision_timeout; } else { conn->le_conn_min_interval = hdev->le_conn_min_interval; conn->le_conn_max_interval = hdev->le_conn_max_interval; conn->le_conn_latency = hdev->le_conn_latency; conn->le_supv_timeout = hdev->le_supv_timeout; } /* If controller is scanning, we stop it since some controllers are * not able to scan and connect at the same time. Also set the * HCI_LE_SCAN_INTERRUPTED flag so that the command complete * handler for scan disabling knows to set the correct discovery * state. */ if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { hci_scan_disable_sync(hdev); hci_dev_set_flag(hdev, HCI_LE_SCAN_INTERRUPTED); } /* Update random address, but set require_privacy to false so * that we never connect with an non-resolvable address. */ err = hci_update_random_address_sync(hdev, false, conn_use_rpa(conn), &own_addr_type); if (err) goto done; /* Send command LE Extended Create Connection if supported */ if (use_ext_conn(hdev)) { err = hci_le_ext_create_conn_sync(hdev, conn, own_addr_type); goto done; } memset(&cp, 0, sizeof(cp)); cp.scan_interval = cpu_to_le16(hdev->le_scan_int_connect); cp.scan_window = cpu_to_le16(hdev->le_scan_window_connect); bacpy(&cp.peer_addr, &conn->dst); cp.peer_addr_type = conn->dst_type; cp.own_address_type = own_addr_type; cp.conn_interval_min = cpu_to_le16(conn->le_conn_min_interval); cp.conn_interval_max = cpu_to_le16(conn->le_conn_max_interval); cp.conn_latency = cpu_to_le16(conn->le_conn_latency); cp.supervision_timeout = cpu_to_le16(conn->le_supv_timeout); cp.min_ce_len = cpu_to_le16(0x0000); cp.max_ce_len = cpu_to_le16(0x0000); /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E page 2261: * * If this event is unmasked and the HCI_LE_Connection_Complete event * is unmasked, only the HCI_LE_Enhanced_Connection_Complete event is * sent when a new connection has been created. */ err = __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CREATE_CONN, sizeof(cp), &cp, use_enhanced_conn_complete(hdev) ? HCI_EV_LE_ENHANCED_CONN_COMPLETE : HCI_EV_LE_CONN_COMPLETE, conn->conn_timeout, NULL); done: if (err == -ETIMEDOUT) hci_le_connect_cancel_sync(hdev, conn, 0x00); /* Re-enable advertising after the connection attempt is finished. */ hci_resume_advertising_sync(hdev); return err; } int hci_le_create_cis_sync(struct hci_dev *hdev) { DEFINE_FLEX(struct hci_cp_le_create_cis, cmd, cis, num_cis, 0x1f); size_t aux_num_cis = 0; struct hci_conn *conn; u8 cig = BT_ISO_QOS_CIG_UNSET; /* The spec allows only one pending LE Create CIS command at a time. If * the command is pending now, don't do anything. We check for pending * connections after each CIS Established event. * * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E * page 2566: * * If the Host issues this command before all the * HCI_LE_CIS_Established events from the previous use of the * command have been generated, the Controller shall return the * error code Command Disallowed (0x0C). * * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E * page 2567: * * When the Controller receives the HCI_LE_Create_CIS command, the * Controller sends the HCI_Command_Status event to the Host. An * HCI_LE_CIS_Established event will be generated for each CIS when it * is established or if it is disconnected or considered lost before * being established; until all the events are generated, the command * remains pending. */ hci_dev_lock(hdev); rcu_read_lock(); /* Wait until previous Create CIS has completed */ list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { if (test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) goto done; } /* Find CIG with all CIS ready */ list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { struct hci_conn *link; if (hci_conn_check_create_cis(conn)) continue; cig = conn->iso_qos.ucast.cig; list_for_each_entry_rcu(link, &hdev->conn_hash.list, list) { if (hci_conn_check_create_cis(link) > 0 && link->iso_qos.ucast.cig == cig && link->state != BT_CONNECTED) { cig = BT_ISO_QOS_CIG_UNSET; break; } } if (cig != BT_ISO_QOS_CIG_UNSET) break; } if (cig == BT_ISO_QOS_CIG_UNSET) goto done; list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { struct hci_cis *cis = &cmd->cis[aux_num_cis]; if (hci_conn_check_create_cis(conn) || conn->iso_qos.ucast.cig != cig) continue; set_bit(HCI_CONN_CREATE_CIS, &conn->flags); cis->acl_handle = cpu_to_le16(conn->parent->handle); cis->cis_handle = cpu_to_le16(conn->handle); aux_num_cis++; if (aux_num_cis >= cmd->num_cis) break; } cmd->num_cis = aux_num_cis; done: rcu_read_unlock(); hci_dev_unlock(hdev); if (!aux_num_cis) return 0; /* Wait for HCI_LE_CIS_Established */ return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CREATE_CIS, struct_size(cmd, cis, cmd->num_cis), cmd, HCI_EVT_LE_CIS_ESTABLISHED, conn->conn_timeout, NULL); } int hci_le_remove_cig_sync(struct hci_dev *hdev, u8 handle) { struct hci_cp_le_remove_cig cp; memset(&cp, 0, sizeof(cp)); cp.cig_id = handle; return __hci_cmd_sync_status(hdev, HCI_OP_LE_REMOVE_CIG, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_le_big_terminate_sync(struct hci_dev *hdev, u8 handle) { struct hci_cp_le_big_term_sync cp; memset(&cp, 0, sizeof(cp)); cp.handle = handle; return __hci_cmd_sync_status(hdev, HCI_OP_LE_BIG_TERM_SYNC, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_le_pa_terminate_sync(struct hci_dev *hdev, u16 handle) { struct hci_cp_le_pa_term_sync cp; memset(&cp, 0, sizeof(cp)); cp.handle = cpu_to_le16(handle); return __hci_cmd_sync_status(hdev, HCI_OP_LE_PA_TERM_SYNC, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, bool use_rpa, struct adv_info *adv_instance, u8 *own_addr_type, bdaddr_t *rand_addr) { int err; bacpy(rand_addr, BDADDR_ANY); /* If privacy is enabled use a resolvable private address. If * current RPA has expired then generate a new one. */ if (use_rpa) { /* If Controller supports LL Privacy use own address type is * 0x03 */ if (ll_privacy_capable(hdev)) *own_addr_type = ADDR_LE_DEV_RANDOM_RESOLVED; else *own_addr_type = ADDR_LE_DEV_RANDOM; if (adv_instance) { if (adv_rpa_valid(adv_instance)) return 0; } else { if (rpa_valid(hdev)) return 0; } err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa); if (err < 0) { bt_dev_err(hdev, "failed to generate new RPA"); return err; } bacpy(rand_addr, &hdev->rpa); return 0; } /* In case of required privacy without resolvable private address, * use an non-resolvable private address. This is useful for * non-connectable advertising. */ if (require_privacy) { bdaddr_t nrpa; while (true) { /* The non-resolvable private address is generated * from random six bytes with the two most significant * bits cleared. */ get_random_bytes(&nrpa, 6); nrpa.b[5] &= 0x3f; /* The non-resolvable private address shall not be * equal to the public address. */ if (bacmp(&hdev->bdaddr, &nrpa)) break; } *own_addr_type = ADDR_LE_DEV_RANDOM; bacpy(rand_addr, &nrpa); return 0; } /* No privacy, use the current address */ hci_copy_identity_address(hdev, rand_addr, own_addr_type); return 0; } static int _update_adv_data_sync(struct hci_dev *hdev, void *data) { u8 instance = PTR_UINT(data); return hci_update_adv_data_sync(hdev, instance); } int hci_update_adv_data(struct hci_dev *hdev, u8 instance) { return hci_cmd_sync_queue(hdev, _update_adv_data_sync, UINT_PTR(instance), NULL); } static int hci_acl_create_conn_sync(struct hci_dev *hdev, void *data) { struct hci_conn *conn = data; struct inquiry_entry *ie; struct hci_cp_create_conn cp; int err; if (!hci_conn_valid(hdev, conn)) return -ECANCELED; /* Many controllers disallow HCI Create Connection while it is doing * HCI Inquiry. So we cancel the Inquiry first before issuing HCI Create * Connection. This may cause the MGMT discovering state to become false * without user space's request but it is okay since the MGMT Discovery * APIs do not promise that discovery should be done forever. Instead, * the user space monitors the status of MGMT discovering and it may * request for discovery again when this flag becomes false. */ if (test_bit(HCI_INQUIRY, &hdev->flags)) { err = __hci_cmd_sync_status(hdev, HCI_OP_INQUIRY_CANCEL, 0, NULL, HCI_CMD_TIMEOUT); if (err) bt_dev_warn(hdev, "Failed to cancel inquiry %d", err); } conn->state = BT_CONNECT; conn->out = true; conn->role = HCI_ROLE_MASTER; conn->attempt++; conn->link_policy = hdev->link_policy; memset(&cp, 0, sizeof(cp)); bacpy(&cp.bdaddr, &conn->dst); cp.pscan_rep_mode = 0x02; ie = hci_inquiry_cache_lookup(hdev, &conn->dst); if (ie) { if (inquiry_entry_age(ie) <= INQUIRY_ENTRY_AGE_MAX) { cp.pscan_rep_mode = ie->data.pscan_rep_mode; cp.pscan_mode = ie->data.pscan_mode; cp.clock_offset = ie->data.clock_offset | cpu_to_le16(0x8000); } memcpy(conn->dev_class, ie->data.dev_class, 3); } cp.pkt_type = cpu_to_le16(conn->pkt_type); if (lmp_rswitch_capable(hdev) && !(hdev->link_mode & HCI_LM_MASTER)) cp.role_switch = 0x01; else cp.role_switch = 0x00; return __hci_cmd_sync_status_sk(hdev, HCI_OP_CREATE_CONN, sizeof(cp), &cp, HCI_EV_CONN_COMPLETE, conn->conn_timeout, NULL); } int hci_connect_acl_sync(struct hci_dev *hdev, struct hci_conn *conn) { return hci_cmd_sync_queue_once(hdev, hci_acl_create_conn_sync, conn, NULL); } static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err) { struct hci_conn *conn = data; bt_dev_dbg(hdev, "err %d", err); if (err == -ECANCELED) return; hci_dev_lock(hdev); if (!hci_conn_valid(hdev, conn)) goto done; if (!err) { hci_connect_le_scan_cleanup(conn, 0x00); goto done; } /* Check if connection is still pending */ if (conn != hci_lookup_le_connect(hdev)) goto done; /* Flush to make sure we send create conn cancel command if needed */ flush_delayed_work(&conn->le_conn_timeout); hci_conn_failed(conn, bt_status(err)); done: hci_dev_unlock(hdev); } int hci_connect_le_sync(struct hci_dev *hdev, struct hci_conn *conn) { return hci_cmd_sync_queue_once(hdev, hci_le_create_conn_sync, conn, create_le_conn_complete); } int hci_cancel_connect_sync(struct hci_dev *hdev, struct hci_conn *conn) { if (conn->state != BT_OPEN) return -EINVAL; switch (conn->type) { case ACL_LINK: return !hci_cmd_sync_dequeue_once(hdev, hci_acl_create_conn_sync, conn, NULL); case LE_LINK: return !hci_cmd_sync_dequeue_once(hdev, hci_le_create_conn_sync, conn, create_le_conn_complete); } return -ENOENT; } int hci_le_conn_update_sync(struct hci_dev *hdev, struct hci_conn *conn, struct hci_conn_params *params) { struct hci_cp_le_conn_update cp; memset(&cp, 0, sizeof(cp)); cp.handle = cpu_to_le16(conn->handle); cp.conn_interval_min = cpu_to_le16(params->conn_min_interval); cp.conn_interval_max = cpu_to_le16(params->conn_max_interval); cp.conn_latency = cpu_to_le16(params->conn_latency); cp.supervision_timeout = cpu_to_le16(params->supervision_timeout); cp.min_ce_len = cpu_to_le16(0x0000); cp.max_ce_len = cpu_to_le16(0x0000); return __hci_cmd_sync_status(hdev, HCI_OP_LE_CONN_UPDATE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static void create_pa_complete(struct hci_dev *hdev, void *data, int err) { struct hci_conn *conn = data; struct hci_conn *pa_sync; bt_dev_dbg(hdev, "err %d", err); if (err == -ECANCELED) return; hci_dev_lock(hdev); if (!hci_conn_valid(hdev, conn)) clear_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags); if (!err) goto unlock; /* Add connection to indicate PA sync error */ pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY, HCI_ROLE_SLAVE); if (IS_ERR(pa_sync)) goto unlock; set_bit(HCI_CONN_PA_SYNC_FAILED, &pa_sync->flags); /* Notify iso layer */ hci_connect_cfm(pa_sync, bt_status(err)); unlock: hci_dev_unlock(hdev); } static int hci_le_pa_create_sync(struct hci_dev *hdev, void *data) { struct hci_cp_le_pa_create_sync cp; struct hci_conn *conn = data; struct bt_iso_qos *qos = &conn->iso_qos; int err; if (!hci_conn_valid(hdev, conn)) return -ECANCELED; if (conn->sync_handle != HCI_SYNC_HANDLE_INVALID) return -EINVAL; if (hci_dev_test_and_set_flag(hdev, HCI_PA_SYNC)) return -EBUSY; /* Stop scanning if SID has not been set and active scanning is enabled * so we use passive scanning which will be scanning using the allow * list programmed to contain only the connection address. */ if (conn->sid == HCI_SID_INVALID && hci_dev_test_flag(hdev, HCI_LE_SCAN)) { hci_scan_disable_sync(hdev); hci_dev_set_flag(hdev, HCI_LE_SCAN_INTERRUPTED); hci_discovery_set_state(hdev, DISCOVERY_STOPPED); } /* Mark HCI_CONN_CREATE_PA_SYNC so hci_update_passive_scan_sync can * program the address in the allow list so PA advertisements can be * received. */ set_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags); hci_update_passive_scan_sync(hdev); /* SID has not been set listen for HCI_EV_LE_EXT_ADV_REPORT to update * it. */ if (conn->sid == HCI_SID_INVALID) { err = __hci_cmd_sync_status_sk(hdev, HCI_OP_NOP, 0, NULL, HCI_EV_LE_EXT_ADV_REPORT, conn->conn_timeout, NULL); if (err == -ETIMEDOUT) goto done; } memset(&cp, 0, sizeof(cp)); cp.options = qos->bcast.options; cp.sid = conn->sid; cp.addr_type = conn->dst_type; bacpy(&cp.addr, &conn->dst); cp.skip = cpu_to_le16(qos->bcast.skip); cp.sync_timeout = cpu_to_le16(qos->bcast.sync_timeout); cp.sync_cte_type = qos->bcast.sync_cte_type; /* The spec allows only one pending LE Periodic Advertising Create * Sync command at a time so we forcefully wait for PA Sync Established * event since cmd_work can only schedule one command at a time. * * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E * page 2493: * * If the Host issues this command when another HCI_LE_Periodic_ * Advertising_Create_Sync command is pending, the Controller shall * return the error code Command Disallowed (0x0C). */ err = __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_PA_CREATE_SYNC, sizeof(cp), &cp, HCI_EV_LE_PA_SYNC_ESTABLISHED, conn->conn_timeout, NULL); if (err == -ETIMEDOUT) __hci_cmd_sync_status(hdev, HCI_OP_LE_PA_CREATE_SYNC_CANCEL, 0, NULL, HCI_CMD_TIMEOUT); done: hci_dev_clear_flag(hdev, HCI_PA_SYNC); /* Update passive scan since HCI_PA_SYNC flag has been cleared */ hci_update_passive_scan_sync(hdev); return err; } int hci_connect_pa_sync(struct hci_dev *hdev, struct hci_conn *conn) { return hci_cmd_sync_queue_once(hdev, hci_le_pa_create_sync, conn, create_pa_complete); } static void create_big_complete(struct hci_dev *hdev, void *data, int err) { struct hci_conn *conn = data; bt_dev_dbg(hdev, "err %d", err); if (err == -ECANCELED) return; if (hci_conn_valid(hdev, conn)) clear_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags); } static int hci_le_big_create_sync(struct hci_dev *hdev, void *data) { DEFINE_FLEX(struct hci_cp_le_big_create_sync, cp, bis, num_bis, 0x11); struct hci_conn *conn = data; struct bt_iso_qos *qos = &conn->iso_qos; int err; if (!hci_conn_valid(hdev, conn)) return -ECANCELED; set_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags); memset(cp, 0, sizeof(*cp)); cp->handle = qos->bcast.big; cp->sync_handle = cpu_to_le16(conn->sync_handle); cp->encryption = qos->bcast.encryption; memcpy(cp->bcode, qos->bcast.bcode, sizeof(cp->bcode)); cp->mse = qos->bcast.mse; cp->timeout = cpu_to_le16(qos->bcast.timeout); cp->num_bis = conn->num_bis; memcpy(cp->bis, conn->bis, conn->num_bis); /* The spec allows only one pending LE BIG Create Sync command at * a time, so we forcefully wait for BIG Sync Established event since * cmd_work can only schedule one command at a time. * * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E * page 2586: * * If the Host sends this command when the Controller is in the * process of synchronizing to any BIG, i.e. the HCI_LE_BIG_Sync_ * Established event has not been generated, the Controller shall * return the error code Command Disallowed (0x0C). */ err = __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_BIG_CREATE_SYNC, struct_size(cp, bis, cp->num_bis), cp, HCI_EVT_LE_BIG_SYNC_ESTABLISHED, conn->conn_timeout, NULL); if (err == -ETIMEDOUT) hci_le_big_terminate_sync(hdev, cp->handle); return err; } int hci_connect_big_sync(struct hci_dev *hdev, struct hci_conn *conn) { return hci_cmd_sync_queue_once(hdev, hci_le_big_create_sync, conn, create_big_complete); }
16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 // SPDX-License-Identifier: GPL-2.0-or-later /* Keyring handling * * Copyright (C) 2004-2005, 2008, 2013 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/seq_file.h> #include <linux/err.h> #include <linux/user_namespace.h> #include <linux/nsproxy.h> #include <keys/keyring-type.h> #include <keys/user-type.h> #include <linux/assoc_array_priv.h> #include <linux/uaccess.h> #include <net/net_namespace.h> #include "internal.h" /* * When plumbing the depths of the key tree, this sets a hard limit * set on how deep we're willing to go. */ #define KEYRING_SEARCH_MAX_DEPTH 6 /* * We mark pointers we pass to the associative array with bit 1 set if * they're keyrings and clear otherwise. */ #define KEYRING_PTR_SUBTYPE 0x2UL static inline bool keyring_ptr_is_keyring(const struct assoc_array_ptr *x) { return (unsigned long)x & KEYRING_PTR_SUBTYPE; } static inline struct key *keyring_ptr_to_key(const struct assoc_array_ptr *x) { void *object = assoc_array_ptr_to_leaf(x); return (struct key *)((unsigned long)object & ~KEYRING_PTR_SUBTYPE); } static inline void *keyring_key_to_ptr(struct key *key) { if (key->type == &key_type_keyring) return (void *)((unsigned long)key | KEYRING_PTR_SUBTYPE); return key; } static DEFINE_RWLOCK(keyring_name_lock); /* * Clean up the bits of user_namespace that belong to us. */ void key_free_user_ns(struct user_namespace *ns) { write_lock(&keyring_name_lock); list_del_init(&ns->keyring_name_list); write_unlock(&keyring_name_lock); key_put(ns->user_keyring_register); #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif } /* * The keyring key type definition. Keyrings are simply keys of this type and * can be treated as ordinary keys in addition to having their own special * operations. */ static int keyring_preparse(struct key_preparsed_payload *prep); static void keyring_free_preparse(struct key_preparsed_payload *prep); static int keyring_instantiate(struct key *keyring, struct key_preparsed_payload *prep); static void keyring_revoke(struct key *keyring); static void keyring_destroy(struct key *keyring); static void keyring_describe(const struct key *keyring, struct seq_file *m); static long keyring_read(const struct key *keyring, char *buffer, size_t buflen); struct key_type key_type_keyring = { .name = "keyring", .def_datalen = 0, .preparse = keyring_preparse, .free_preparse = keyring_free_preparse, .instantiate = keyring_instantiate, .revoke = keyring_revoke, .destroy = keyring_destroy, .describe = keyring_describe, .read = keyring_read, }; EXPORT_SYMBOL(key_type_keyring); /* * Semaphore to serialise link/link calls to prevent two link calls in parallel * introducing a cycle. */ static DEFINE_MUTEX(keyring_serialise_link_lock); /* * Publish the name of a keyring so that it can be found by name (if it has * one and it doesn't begin with a dot). */ static void keyring_publish_name(struct key *keyring) { struct user_namespace *ns = current_user_ns(); if (keyring->description && keyring->description[0] && keyring->description[0] != '.') { write_lock(&keyring_name_lock); list_add_tail(&keyring->name_link, &ns->keyring_name_list); write_unlock(&keyring_name_lock); } } /* * Preparse a keyring payload */ static int keyring_preparse(struct key_preparsed_payload *prep) { return prep->datalen != 0 ? -EINVAL : 0; } /* * Free a preparse of a user defined key payload */ static void keyring_free_preparse(struct key_preparsed_payload *prep) { } /* * Initialise a keyring. * * Returns 0 on success, -EINVAL if given any data. */ static int keyring_instantiate(struct key *keyring, struct key_preparsed_payload *prep) { assoc_array_init(&keyring->keys); /* make the keyring available by name if it has one */ keyring_publish_name(keyring); return 0; } /* * Multiply 64-bits by 32-bits to 96-bits and fold back to 64-bit. Ideally we'd * fold the carry back too, but that requires inline asm. */ static u64 mult_64x32_and_fold(u64 x, u32 y) { u64 hi = (u64)(u32)(x >> 32) * y; u64 lo = (u64)(u32)(x) * y; return lo + ((u64)(u32)hi << 32) + (u32)(hi >> 32); } /* * Hash a key type and description. */ static void hash_key_type_and_desc(struct keyring_index_key *index_key) { const unsigned level_shift = ASSOC_ARRAY_LEVEL_STEP; const unsigned long fan_mask = ASSOC_ARRAY_FAN_MASK; const char *description = index_key->description; unsigned long hash, type; u32 piece; u64 acc; int n, desc_len = index_key->desc_len; type = (unsigned long)index_key->type; acc = mult_64x32_and_fold(type, desc_len + 13); acc = mult_64x32_and_fold(acc, 9207); piece = (unsigned long)index_key->domain_tag; acc = mult_64x32_and_fold(acc, piece); acc = mult_64x32_and_fold(acc, 9207); for (;;) { n = desc_len; if (n <= 0) break; if (n > 4) n = 4; piece = 0; memcpy(&piece, description, n); description += n; desc_len -= n; acc = mult_64x32_and_fold(acc, piece); acc = mult_64x32_and_fold(acc, 9207); } /* Fold the hash down to 32 bits if need be. */ hash = acc; if (ASSOC_ARRAY_KEY_CHUNK_SIZE == 32) hash ^= acc >> 32; /* Squidge all the keyrings into a separate part of the tree to * ordinary keys by making sure the lowest level segment in the hash is * zero for keyrings and non-zero otherwise. */ if (index_key->type != &key_type_keyring && (hash & fan_mask) == 0) hash |= (hash >> (ASSOC_ARRAY_KEY_CHUNK_SIZE - level_shift)) | 1; else if (index_key->type == &key_type_keyring && (hash & fan_mask) != 0) hash = (hash + (hash << level_shift)) & ~fan_mask; index_key->hash = hash; } /* * Finalise an index key to include a part of the description actually in the * index key, to set the domain tag and to calculate the hash. */ void key_set_index_key(struct keyring_index_key *index_key) { static struct key_tag default_domain_tag = { .usage = REFCOUNT_INIT(1), }; size_t n = min_t(size_t, index_key->desc_len, sizeof(index_key->desc)); memcpy(index_key->desc, index_key->description, n); if (!index_key->domain_tag) { if (index_key->type->flags & KEY_TYPE_NET_DOMAIN) index_key->domain_tag = current->nsproxy->net_ns->key_domain; else index_key->domain_tag = &default_domain_tag; } hash_key_type_and_desc(index_key); } /** * key_put_tag - Release a ref on a tag. * @tag: The tag to release. * * This releases a reference the given tag and returns true if that ref was the * last one. */ bool key_put_tag(struct key_tag *tag) { if (refcount_dec_and_test(&tag->usage)) { kfree_rcu(tag, rcu); return true; } return false; } /** * key_remove_domain - Kill off a key domain and gc its keys * @domain_tag: The domain tag to release. * * This marks a domain tag as being dead and releases a ref on it. If that * wasn't the last reference, the garbage collector is poked to try and delete * all keys that were in the domain. */ void key_remove_domain(struct key_tag *domain_tag) { domain_tag->removed = true; if (!key_put_tag(domain_tag)) key_schedule_gc_links(); } /* * Build the next index key chunk. * * We return it one word-sized chunk at a time. */ static unsigned long keyring_get_key_chunk(const void *data, int level) { const struct keyring_index_key *index_key = data; unsigned long chunk = 0; const u8 *d; int desc_len = index_key->desc_len, n = sizeof(chunk); level /= ASSOC_ARRAY_KEY_CHUNK_SIZE; switch (level) { case 0: return index_key->hash; case 1: return index_key->x; case 2: return (unsigned long)index_key->type; case 3: return (unsigned long)index_key->domain_tag; default: level -= 4; if (desc_len <= sizeof(index_key->desc)) return 0; d = index_key->description + sizeof(index_key->desc); d += level * sizeof(long); desc_len -= sizeof(index_key->desc); if (desc_len > n) desc_len = n; do { chunk <<= 8; chunk |= *d++; } while (--desc_len > 0); return chunk; } } static unsigned long keyring_get_object_key_chunk(const void *object, int level) { const struct key *key = keyring_ptr_to_key(object); return keyring_get_key_chunk(&key->index_key, level); } static bool keyring_compare_object(const void *object, const void *data) { const struct keyring_index_key *index_key = data; const struct key *key = keyring_ptr_to_key(object); return key->index_key.type == index_key->type && key->index_key.domain_tag == index_key->domain_tag && key->index_key.desc_len == index_key->desc_len && memcmp(key->index_key.description, index_key->description, index_key->desc_len) == 0; } /* * Compare the index keys of a pair of objects and determine the bit position * at which they differ - if they differ. */ static int keyring_diff_objects(const void *object, const void *data) { const struct key *key_a = keyring_ptr_to_key(object); const struct keyring_index_key *a = &key_a->index_key; const struct keyring_index_key *b = data; unsigned long seg_a, seg_b; int level, i; level = 0; seg_a = a->hash; seg_b = b->hash; if ((seg_a ^ seg_b) != 0) goto differ; level += ASSOC_ARRAY_KEY_CHUNK_SIZE / 8; /* The number of bits contributed by the hash is controlled by a * constant in the assoc_array headers. Everything else thereafter we * can deal with as being machine word-size dependent. */ seg_a = a->x; seg_b = b->x; if ((seg_a ^ seg_b) != 0) goto differ; level += sizeof(unsigned long); /* The next bit may not work on big endian */ seg_a = (unsigned long)a->type; seg_b = (unsigned long)b->type; if ((seg_a ^ seg_b) != 0) goto differ; level += sizeof(unsigned long); seg_a = (unsigned long)a->domain_tag; seg_b = (unsigned long)b->domain_tag; if ((seg_a ^ seg_b) != 0) goto differ; level += sizeof(unsigned long); i = sizeof(a->desc); if (a->desc_len <= i) goto same; for (; i < a->desc_len; i++) { seg_a = *(unsigned char *)(a->description + i); seg_b = *(unsigned char *)(b->description + i); if ((seg_a ^ seg_b) != 0) goto differ_plus_i; } same: return -1; differ_plus_i: level += i; differ: i = level * 8 + __ffs(seg_a ^ seg_b); return i; } /* * Free an object after stripping the keyring flag off of the pointer. */ static void keyring_free_object(void *object) { key_put(keyring_ptr_to_key(object)); } /* * Operations for keyring management by the index-tree routines. */ static const struct assoc_array_ops keyring_assoc_array_ops = { .get_key_chunk = keyring_get_key_chunk, .get_object_key_chunk = keyring_get_object_key_chunk, .compare_object = keyring_compare_object, .diff_objects = keyring_diff_objects, .free_object = keyring_free_object, }; /* * Clean up a keyring when it is destroyed. Unpublish its name if it had one * and dispose of its data. * * The garbage collector detects the final key_put(), removes the keyring from * the serial number tree and then does RCU synchronisation before coming here, * so we shouldn't need to worry about code poking around here with the RCU * readlock held by this time. */ static void keyring_destroy(struct key *keyring) { if (keyring->description) { write_lock(&keyring_name_lock); if (keyring->name_link.next != NULL && !list_empty(&keyring->name_link)) list_del(&keyring->name_link); write_unlock(&keyring_name_lock); } if (keyring->restrict_link) { struct key_restriction *keyres = keyring->restrict_link; key_put(keyres->key); kfree(keyres); } assoc_array_destroy(&keyring->keys, &keyring_assoc_array_ops); } /* * Describe a keyring for /proc. */ static void keyring_describe(const struct key *keyring, struct seq_file *m) { if (keyring->description) seq_puts(m, keyring->description); else seq_puts(m, "[anon]"); if (key_is_positive(keyring)) { if (keyring->keys.nr_leaves_on_tree != 0) seq_printf(m, ": %lu", keyring->keys.nr_leaves_on_tree); else seq_puts(m, ": empty"); } } struct keyring_read_iterator_context { size_t buflen; size_t count; key_serial_t *buffer; }; static int keyring_read_iterator(const void *object, void *data) { struct keyring_read_iterator_context *ctx = data; const struct key *key = keyring_ptr_to_key(object); kenter("{%s,%d},,{%zu/%zu}", key->type->name, key->serial, ctx->count, ctx->buflen); if (ctx->count >= ctx->buflen) return 1; *ctx->buffer++ = key->serial; ctx->count += sizeof(key->serial); return 0; } /* * Read a list of key IDs from the keyring's contents in binary form * * The keyring's semaphore is read-locked by the caller. This prevents someone * from modifying it under us - which could cause us to read key IDs multiple * times. */ static long keyring_read(const struct key *keyring, char *buffer, size_t buflen) { struct keyring_read_iterator_context ctx; long ret; kenter("{%d},,%zu", key_serial(keyring), buflen); if (buflen & (sizeof(key_serial_t) - 1)) return -EINVAL; /* Copy as many key IDs as fit into the buffer */ if (buffer && buflen) { ctx.buffer = (key_serial_t *)buffer; ctx.buflen = buflen; ctx.count = 0; ret = assoc_array_iterate(&keyring->keys, keyring_read_iterator, &ctx); if (ret < 0) { kleave(" = %ld [iterate]", ret); return ret; } } /* Return the size of the buffer needed */ ret = keyring->keys.nr_leaves_on_tree * sizeof(key_serial_t); if (ret <= buflen) kleave("= %ld [ok]", ret); else kleave("= %ld [buffer too small]", ret); return ret; } /* * Allocate a keyring and link into the destination keyring. */ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, const struct cred *cred, key_perm_t perm, unsigned long flags, struct key_restriction *restrict_link, struct key *dest) { struct key *keyring; int ret; keyring = key_alloc(&key_type_keyring, description, uid, gid, cred, perm, flags, restrict_link); if (!IS_ERR(keyring)) { ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL); if (ret < 0) { key_put(keyring); keyring = ERR_PTR(ret); } } return keyring; } EXPORT_SYMBOL(keyring_alloc); /** * restrict_link_reject - Give -EPERM to restrict link * @keyring: The keyring being added to. * @type: The type of key being added. * @payload: The payload of the key intended to be added. * @restriction_key: Keys providing additional data for evaluating restriction. * * Reject the addition of any links to a keyring. It can be overridden by * passing KEY_ALLOC_BYPASS_RESTRICTION to key_instantiate_and_link() when * adding a key to a keyring. * * This is meant to be stored in a key_restriction structure which is passed * in the restrict_link parameter to keyring_alloc(). */ int restrict_link_reject(struct key *keyring, const struct key_type *type, const union key_payload *payload, struct key *restriction_key) { return -EPERM; } /* * By default, we keys found by getting an exact match on their descriptions. */ bool key_default_cmp(const struct key *key, const struct key_match_data *match_data) { return strcmp(key->description, match_data->raw_data) == 0; } /* * Iteration function to consider each key found. */ static int keyring_search_iterator(const void *object, void *iterator_data) { struct keyring_search_context *ctx = iterator_data; const struct key *key = keyring_ptr_to_key(object); unsigned long kflags = READ_ONCE(key->flags); short state = READ_ONCE(key->state); kenter("{%d}", key->serial); /* ignore keys not of this type */ if (key->type != ctx->index_key.type) { kleave(" = 0 [!type]"); return 0; } /* skip invalidated, revoked and expired keys */ if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) { time64_t expiry = READ_ONCE(key->expiry); if (kflags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) { ctx->result = ERR_PTR(-EKEYREVOKED); kleave(" = %d [invrev]", ctx->skipped_ret); goto skipped; } if (expiry && ctx->now >= expiry) { if (!(ctx->flags & KEYRING_SEARCH_SKIP_EXPIRED)) ctx->result = ERR_PTR(-EKEYEXPIRED); kleave(" = %d [expire]", ctx->skipped_ret); goto skipped; } } /* keys that don't match */ if (!ctx->match_data.cmp(key, &ctx->match_data)) { kleave(" = 0 [!match]"); return 0; } /* key must have search permissions */ if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM) && key_task_permission(make_key_ref(key, ctx->possessed), ctx->cred, KEY_NEED_SEARCH) < 0) { ctx->result = ERR_PTR(-EACCES); kleave(" = %d [!perm]", ctx->skipped_ret); goto skipped; } if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) { /* we set a different error code if we pass a negative key */ if (state < 0) { ctx->result = ERR_PTR(state); kleave(" = %d [neg]", ctx->skipped_ret); goto skipped; } } /* Found */ ctx->result = make_key_ref(key, ctx->possessed); kleave(" = 1 [found]"); return 1; skipped: return ctx->skipped_ret; } /* * Search inside a keyring for a key. We can search by walking to it * directly based on its index-key or we can iterate over the entire * tree looking for it, based on the match function. */ static int search_keyring(struct key *keyring, struct keyring_search_context *ctx) { if (ctx->match_data.lookup_type == KEYRING_SEARCH_LOOKUP_DIRECT) { const void *object; object = assoc_array_find(&keyring->keys, &keyring_assoc_array_ops, &ctx->index_key); return object ? ctx->iterator(object, ctx) : 0; } return assoc_array_iterate(&keyring->keys, ctx->iterator, ctx); } /* * Search a tree of keyrings that point to other keyrings up to the maximum * depth. */ static bool search_nested_keyrings(struct key *keyring, struct keyring_search_context *ctx) { struct { struct key *keyring; struct assoc_array_node *node; int slot; } stack[KEYRING_SEARCH_MAX_DEPTH]; struct assoc_array_shortcut *shortcut; struct assoc_array_node *node; struct assoc_array_ptr *ptr; struct key *key; int sp = 0, slot; kenter("{%d},{%s,%s}", keyring->serial, ctx->index_key.type->name, ctx->index_key.description); #define STATE_CHECKS (KEYRING_SEARCH_NO_STATE_CHECK | KEYRING_SEARCH_DO_STATE_CHECK) BUG_ON((ctx->flags & STATE_CHECKS) == 0 || (ctx->flags & STATE_CHECKS) == STATE_CHECKS); if (ctx->index_key.description) key_set_index_key(&ctx->index_key); /* Check to see if this top-level keyring is what we are looking for * and whether it is valid or not. */ if (ctx->match_data.lookup_type == KEYRING_SEARCH_LOOKUP_ITERATE || keyring_compare_object(keyring, &ctx->index_key)) { ctx->skipped_ret = 2; switch (ctx->iterator(keyring_key_to_ptr(keyring), ctx)) { case 1: goto found; case 2: return false; default: break; } } ctx->skipped_ret = 0; /* Start processing a new keyring */ descend_to_keyring: kdebug("descend to %d", keyring->serial); if (keyring->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) goto not_this_keyring; /* Search through the keys in this keyring before its searching its * subtrees. */ if (search_keyring(keyring, ctx)) goto found; /* Then manually iterate through the keyrings nested in this one. * * Start from the root node of the index tree. Because of the way the * hash function has been set up, keyrings cluster on the leftmost * branch of the root node (root slot 0) or in the root node itself. * Non-keyrings avoid the leftmost branch of the root entirely (root * slots 1-15). */ if (!(ctx->flags & KEYRING_SEARCH_RECURSE)) goto not_this_keyring; ptr = READ_ONCE(keyring->keys.root); if (!ptr) goto not_this_keyring; if (assoc_array_ptr_is_shortcut(ptr)) { /* If the root is a shortcut, either the keyring only contains * keyring pointers (everything clusters behind root slot 0) or * doesn't contain any keyring pointers. */ shortcut = assoc_array_ptr_to_shortcut(ptr); if ((shortcut->index_key[0] & ASSOC_ARRAY_FAN_MASK) != 0) goto not_this_keyring; ptr = READ_ONCE(shortcut->next_node); node = assoc_array_ptr_to_node(ptr); goto begin_node; } node = assoc_array_ptr_to_node(ptr); ptr = node->slots[0]; if (!assoc_array_ptr_is_meta(ptr)) goto begin_node; descend_to_node: /* Descend to a more distal node in this keyring's content tree and go * through that. */ kdebug("descend"); if (assoc_array_ptr_is_shortcut(ptr)) { shortcut = assoc_array_ptr_to_shortcut(ptr); ptr = READ_ONCE(shortcut->next_node); BUG_ON(!assoc_array_ptr_is_node(ptr)); } node = assoc_array_ptr_to_node(ptr); begin_node: kdebug("begin_node"); slot = 0; ascend_to_node: /* Go through the slots in a node */ for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = READ_ONCE(node->slots[slot]); if (assoc_array_ptr_is_meta(ptr)) { if (node->back_pointer || assoc_array_ptr_is_shortcut(ptr)) goto descend_to_node; } if (!keyring_ptr_is_keyring(ptr)) continue; key = keyring_ptr_to_key(ptr); if (sp >= KEYRING_SEARCH_MAX_DEPTH) { if (ctx->flags & KEYRING_SEARCH_DETECT_TOO_DEEP) { ctx->result = ERR_PTR(-ELOOP); return false; } goto not_this_keyring; } /* Search a nested keyring */ if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM) && key_task_permission(make_key_ref(key, ctx->possessed), ctx->cred, KEY_NEED_SEARCH) < 0) continue; /* stack the current position */ stack[sp].keyring = keyring; stack[sp].node = node; stack[sp].slot = slot; sp++; /* begin again with the new keyring */ keyring = key; goto descend_to_keyring; } /* We've dealt with all the slots in the current node, so now we need * to ascend to the parent and continue processing there. */ ptr = READ_ONCE(node->back_pointer); slot = node->parent_slot; if (ptr && assoc_array_ptr_is_shortcut(ptr)) { shortcut = assoc_array_ptr_to_shortcut(ptr); ptr = READ_ONCE(shortcut->back_pointer); slot = shortcut->parent_slot; } if (!ptr) goto not_this_keyring; node = assoc_array_ptr_to_node(ptr); slot++; /* If we've ascended to the root (zero backpointer), we must have just * finished processing the leftmost branch rather than the root slots - * so there can't be any more keyrings for us to find. */ if (node->back_pointer) { kdebug("ascend %d", slot); goto ascend_to_node; } /* The keyring we're looking at was disqualified or didn't contain a * matching key. */ not_this_keyring: kdebug("not_this_keyring %d", sp); if (sp <= 0) { kleave(" = false"); return false; } /* Resume the processing of a keyring higher up in the tree */ sp--; keyring = stack[sp].keyring; node = stack[sp].node; slot = stack[sp].slot + 1; kdebug("ascend to %d [%d]", keyring->serial, slot); goto ascend_to_node; /* We found a viable match */ found: key = key_ref_to_ptr(ctx->result); key_check(key); if (!(ctx->flags & KEYRING_SEARCH_NO_UPDATE_TIME)) { key->last_used_at = ctx->now; keyring->last_used_at = ctx->now; while (sp > 0) stack[--sp].keyring->last_used_at = ctx->now; } kleave(" = true"); return true; } /** * keyring_search_rcu - Search a keyring tree for a matching key under RCU * @keyring_ref: A pointer to the keyring with possession indicator. * @ctx: The keyring search context. * * Search the supplied keyring tree for a key that matches the criteria given. * The root keyring and any linked keyrings must grant Search permission to the * caller to be searchable and keys can only be found if they too grant Search * to the caller. The possession flag on the root keyring pointer controls use * of the possessor bits in permissions checking of the entire tree. In * addition, the LSM gets to forbid keyring searches and key matches. * * The search is performed as a breadth-then-depth search up to the prescribed * limit (KEYRING_SEARCH_MAX_DEPTH). The caller must hold the RCU read lock to * prevent keyrings from being destroyed or rearranged whilst they are being * searched. * * Keys are matched to the type provided and are then filtered by the match * function, which is given the description to use in any way it sees fit. The * match function may use any attributes of a key that it wishes to * determine the match. Normally the match function from the key type would be * used. * * RCU can be used to prevent the keyring key lists from disappearing without * the need to take lots of locks. * * Returns a pointer to the found key and increments the key usage count if * successful; -EAGAIN if no matching keys were found, or if expired or revoked * keys were found; -ENOKEY if only negative keys were found; -ENOTDIR if the * specified keyring wasn't a keyring. * * In the case of a successful return, the possession attribute from * @keyring_ref is propagated to the returned key reference. */ key_ref_t keyring_search_rcu(key_ref_t keyring_ref, struct keyring_search_context *ctx) { struct key *keyring; long err; ctx->iterator = keyring_search_iterator; ctx->possessed = is_key_possessed(keyring_ref); ctx->result = ERR_PTR(-EAGAIN); keyring = key_ref_to_ptr(keyring_ref); key_check(keyring); if (keyring->type != &key_type_keyring) return ERR_PTR(-ENOTDIR); if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM)) { err = key_task_permission(keyring_ref, ctx->cred, KEY_NEED_SEARCH); if (err < 0) return ERR_PTR(err); } ctx->now = ktime_get_real_seconds(); if (search_nested_keyrings(keyring, ctx)) __key_get(key_ref_to_ptr(ctx->result)); return ctx->result; } /** * keyring_search - Search the supplied keyring tree for a matching key * @keyring: The root of the keyring tree to be searched. * @type: The type of keyring we want to find. * @description: The name of the keyring we want to find. * @recurse: True to search the children of @keyring also * * As keyring_search_rcu() above, but using the current task's credentials and * type's default matching function and preferred search method. */ key_ref_t keyring_search(key_ref_t keyring, struct key_type *type, const char *description, bool recurse) { struct keyring_search_context ctx = { .index_key.type = type, .index_key.description = description, .index_key.desc_len = strlen(description), .cred = current_cred(), .match_data.cmp = key_default_cmp, .match_data.raw_data = description, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = KEYRING_SEARCH_DO_STATE_CHECK, }; key_ref_t key; int ret; if (recurse) ctx.flags |= KEYRING_SEARCH_RECURSE; if (type->match_preparse) { ret = type->match_preparse(&ctx.match_data); if (ret < 0) return ERR_PTR(ret); } rcu_read_lock(); key = keyring_search_rcu(keyring, &ctx); rcu_read_unlock(); if (type->match_free) type->match_free(&ctx.match_data); return key; } EXPORT_SYMBOL(keyring_search); static struct key_restriction *keyring_restriction_alloc( key_restrict_link_func_t check) { struct key_restriction *keyres = kzalloc(sizeof(struct key_restriction), GFP_KERNEL); if (!keyres) return ERR_PTR(-ENOMEM); keyres->check = check; return keyres; } /* * Semaphore to serialise restriction setup to prevent reference count * cycles through restriction key pointers. */ static DECLARE_RWSEM(keyring_serialise_restrict_sem); /* * Check for restriction cycles that would prevent keyring garbage collection. * keyring_serialise_restrict_sem must be held. */ static bool keyring_detect_restriction_cycle(const struct key *dest_keyring, struct key_restriction *keyres) { while (keyres && keyres->key && keyres->key->type == &key_type_keyring) { if (keyres->key == dest_keyring) return true; keyres = keyres->key->restrict_link; } return false; } /** * keyring_restrict - Look up and apply a restriction to a keyring * @keyring_ref: The keyring to be restricted * @type: The key type that will provide the restriction checker. * @restriction: The restriction options to apply to the keyring * * Look up a keyring and apply a restriction to it. The restriction is managed * by the specific key type, but can be configured by the options specified in * the restriction string. */ int keyring_restrict(key_ref_t keyring_ref, const char *type, const char *restriction) { struct key *keyring; struct key_type *restrict_type = NULL; struct key_restriction *restrict_link; int ret = 0; keyring = key_ref_to_ptr(keyring_ref); key_check(keyring); if (keyring->type != &key_type_keyring) return -ENOTDIR; if (!type) { restrict_link = keyring_restriction_alloc(restrict_link_reject); } else { restrict_type = key_type_lookup(type); if (IS_ERR(restrict_type)) return PTR_ERR(restrict_type); if (!restrict_type->lookup_restriction) { ret = -ENOENT; goto error; } restrict_link = restrict_type->lookup_restriction(restriction); } if (IS_ERR(restrict_link)) { ret = PTR_ERR(restrict_link); goto error; } down_write(&keyring->sem); down_write(&keyring_serialise_restrict_sem); if (keyring->restrict_link) { ret = -EEXIST; } else if (keyring_detect_restriction_cycle(keyring, restrict_link)) { ret = -EDEADLK; } else { keyring->restrict_link = restrict_link; notify_key(keyring, NOTIFY_KEY_SETATTR, 0); } up_write(&keyring_serialise_restrict_sem); up_write(&keyring->sem); if (ret < 0) { key_put(restrict_link->key); kfree(restrict_link); } error: if (restrict_type) key_type_put(restrict_type); return ret; } EXPORT_SYMBOL(keyring_restrict); /* * Search the given keyring for a key that might be updated. * * The caller must guarantee that the keyring is a keyring and that the * permission is granted to modify the keyring as no check is made here. The * caller must also hold a lock on the keyring semaphore. * * Returns a pointer to the found key with usage count incremented if * successful and returns NULL if not found. Revoked and invalidated keys are * skipped over. * * If successful, the possession indicator is propagated from the keyring ref * to the returned key reference. */ key_ref_t find_key_to_update(key_ref_t keyring_ref, const struct keyring_index_key *index_key) { struct key *keyring, *key; const void *object; keyring = key_ref_to_ptr(keyring_ref); kenter("{%d},{%s,%s}", keyring->serial, index_key->type->name, index_key->description); object = assoc_array_find(&keyring->keys, &keyring_assoc_array_ops, index_key); if (object) goto found; kleave(" = NULL"); return NULL; found: key = keyring_ptr_to_key(object); if (key->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) { kleave(" = NULL [x]"); return NULL; } __key_get(key); kleave(" = {%d}", key->serial); return make_key_ref(key, is_key_possessed(keyring_ref)); } /* * Find a keyring with the specified name. * * Only keyrings that have nonzero refcount, are not revoked, and are owned by a * user in the current user namespace are considered. If @uid_keyring is %true, * the keyring additionally must have been allocated as a user or user session * keyring; otherwise, it must grant Search permission directly to the caller. * * Returns a pointer to the keyring with the keyring's refcount having being * incremented on success. -ENOKEY is returned if a key could not be found. */ struct key *find_keyring_by_name(const char *name, bool uid_keyring) { struct user_namespace *ns = current_user_ns(); struct key *keyring; if (!name) return ERR_PTR(-EINVAL); read_lock(&keyring_name_lock); /* Search this hash bucket for a keyring with a matching name that * grants Search permission and that hasn't been revoked */ list_for_each_entry(keyring, &ns->keyring_name_list, name_link) { if (!kuid_has_mapping(ns, keyring->user->uid)) continue; if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) continue; if (strcmp(keyring->description, name) != 0) continue; if (uid_keyring) { if (!test_bit(KEY_FLAG_UID_KEYRING, &keyring->flags)) continue; } else { if (key_permission(make_key_ref(keyring, 0), KEY_NEED_SEARCH) < 0) continue; } /* we've got a match but we might end up racing with * key_cleanup() if the keyring is currently 'dead' * (ie. it has a zero usage count) */ if (!refcount_inc_not_zero(&keyring->usage)) continue; keyring->last_used_at = ktime_get_real_seconds(); goto out; } keyring = ERR_PTR(-ENOKEY); out: read_unlock(&keyring_name_lock); return keyring; } static int keyring_detect_cycle_iterator(const void *object, void *iterator_data) { struct keyring_search_context *ctx = iterator_data; const struct key *key = keyring_ptr_to_key(object); kenter("{%d}", key->serial); /* We might get a keyring with matching index-key that is nonetheless a * different keyring. */ if (key != ctx->match_data.raw_data) return 0; ctx->result = ERR_PTR(-EDEADLK); return 1; } /* * See if a cycle will be created by inserting acyclic tree B in acyclic * tree A at the topmost level (ie: as a direct child of A). * * Since we are adding B to A at the top level, checking for cycles should just * be a matter of seeing if node A is somewhere in tree B. */ static int keyring_detect_cycle(struct key *A, struct key *B) { struct keyring_search_context ctx = { .index_key = A->index_key, .match_data.raw_data = A, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .iterator = keyring_detect_cycle_iterator, .flags = (KEYRING_SEARCH_NO_STATE_CHECK | KEYRING_SEARCH_NO_UPDATE_TIME | KEYRING_SEARCH_NO_CHECK_PERM | KEYRING_SEARCH_DETECT_TOO_DEEP | KEYRING_SEARCH_RECURSE), }; rcu_read_lock(); search_nested_keyrings(B, &ctx); rcu_read_unlock(); return PTR_ERR(ctx.result) == -EAGAIN ? 0 : PTR_ERR(ctx.result); } /* * Lock keyring for link. */ int __key_link_lock(struct key *keyring, const struct keyring_index_key *index_key) __acquires(&keyring->sem) __acquires(&keyring_serialise_link_lock) { if (keyring->type != &key_type_keyring) return -ENOTDIR; down_write(&keyring->sem); /* Serialise link/link calls to prevent parallel calls causing a cycle * when linking two keyring in opposite orders. */ if (index_key->type == &key_type_keyring) mutex_lock(&keyring_serialise_link_lock); return 0; } /* * Lock keyrings for move (link/unlink combination). */ int __key_move_lock(struct key *l_keyring, struct key *u_keyring, const struct keyring_index_key *index_key) __acquires(&l_keyring->sem) __acquires(&u_keyring->sem) __acquires(&keyring_serialise_link_lock) { if (l_keyring->type != &key_type_keyring || u_keyring->type != &key_type_keyring) return -ENOTDIR; /* We have to be very careful here to take the keyring locks in the * right order, lest we open ourselves to deadlocking against another * move operation. */ if (l_keyring < u_keyring) { down_write(&l_keyring->sem); down_write_nested(&u_keyring->sem, 1); } else { down_write(&u_keyring->sem); down_write_nested(&l_keyring->sem, 1); } /* Serialise link/link calls to prevent parallel calls causing a cycle * when linking two keyring in opposite orders. */ if (index_key->type == &key_type_keyring) mutex_lock(&keyring_serialise_link_lock); return 0; } /* * Preallocate memory so that a key can be linked into to a keyring. */ int __key_link_begin(struct key *keyring, const struct keyring_index_key *index_key, struct assoc_array_edit **_edit) { struct assoc_array_edit *edit; int ret; kenter("%d,%s,%s,", keyring->serial, index_key->type->name, index_key->description); BUG_ON(index_key->desc_len == 0); BUG_ON(*_edit != NULL); *_edit = NULL; ret = -EKEYREVOKED; if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) goto error; /* Create an edit script that will insert/replace the key in the * keyring tree. */ edit = assoc_array_insert(&keyring->keys, &keyring_assoc_array_ops, index_key, NULL); if (IS_ERR(edit)) { ret = PTR_ERR(edit); goto error; } /* If we're not replacing a link in-place then we're going to need some * extra quota. */ if (!edit->dead_leaf) { ret = key_payload_reserve(keyring, keyring->datalen + KEYQUOTA_LINK_BYTES); if (ret < 0) goto error_cancel; } *_edit = edit; kleave(" = 0"); return 0; error_cancel: assoc_array_cancel_edit(edit); error: kleave(" = %d", ret); return ret; } /* * Check already instantiated keys aren't going to be a problem. * * The caller must have called __key_link_begin(). Don't need to call this for * keys that were created since __key_link_begin() was called. */ int __key_link_check_live_key(struct key *keyring, struct key *key) { if (key->type == &key_type_keyring) /* check that we aren't going to create a cycle by linking one * keyring to another */ return keyring_detect_cycle(keyring, key); return 0; } /* * Link a key into to a keyring. * * Must be called with __key_link_begin() having being called. Discards any * already extant link to matching key if there is one, so that each keyring * holds at most one link to any given key of a particular type+description * combination. */ void __key_link(struct key *keyring, struct key *key, struct assoc_array_edit **_edit) { __key_get(key); assoc_array_insert_set_object(*_edit, keyring_key_to_ptr(key)); assoc_array_apply_edit(*_edit); *_edit = NULL; notify_key(keyring, NOTIFY_KEY_LINKED, key_serial(key)); } /* * Finish linking a key into to a keyring. * * Must be called with __key_link_begin() having being called. */ void __key_link_end(struct key *keyring, const struct keyring_index_key *index_key, struct assoc_array_edit *edit) __releases(&keyring->sem) __releases(&keyring_serialise_link_lock) { BUG_ON(index_key->type == NULL); kenter("%d,%s,", keyring->serial, index_key->type->name); if (edit) { if (!edit->dead_leaf) { key_payload_reserve(keyring, keyring->datalen - KEYQUOTA_LINK_BYTES); } assoc_array_cancel_edit(edit); } up_write(&keyring->sem); if (index_key->type == &key_type_keyring) mutex_unlock(&keyring_serialise_link_lock); } /* * Check addition of keys to restricted keyrings. */ static int __key_link_check_restriction(struct key *keyring, struct key *key) { if (!keyring->restrict_link || !keyring->restrict_link->check) return 0; return keyring->restrict_link->check(keyring, key->type, &key->payload, keyring->restrict_link->key); } /** * key_link - Link a key to a keyring * @keyring: The keyring to make the link in. * @key: The key to link to. * * Make a link in a keyring to a key, such that the keyring holds a reference * on that key and the key can potentially be found by searching that keyring. * * This function will write-lock the keyring's semaphore and will consume some * of the user's key data quota to hold the link. * * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring, * -EKEYREVOKED if the keyring has been revoked, -ENFILE if the keyring is * full, -EDQUOT if there is insufficient key data quota remaining to add * another link or -ENOMEM if there's insufficient memory. * * It is assumed that the caller has checked that it is permitted for a link to * be made (the keyring should have Write permission and the key Link * permission). */ int key_link(struct key *keyring, struct key *key) { struct assoc_array_edit *edit = NULL; int ret; kenter("{%d,%d}", keyring->serial, refcount_read(&keyring->usage)); key_check(keyring); key_check(key); ret = __key_link_lock(keyring, &key->index_key); if (ret < 0) goto error; ret = __key_link_begin(keyring, &key->index_key, &edit); if (ret < 0) goto error_end; kdebug("begun {%d,%d}", keyring->serial, refcount_read(&keyring->usage)); ret = __key_link_check_restriction(keyring, key); if (ret == 0) ret = __key_link_check_live_key(keyring, key); if (ret == 0) __key_link(keyring, key, &edit); error_end: __key_link_end(keyring, &key->index_key, edit); error: kleave(" = %d {%d,%d}", ret, keyring->serial, refcount_read(&keyring->usage)); return ret; } EXPORT_SYMBOL(key_link); /* * Lock a keyring for unlink. */ static int __key_unlink_lock(struct key *keyring) __acquires(&keyring->sem) { if (keyring->type != &key_type_keyring) return -ENOTDIR; down_write(&keyring->sem); return 0; } /* * Begin the process of unlinking a key from a keyring. */ static int __key_unlink_begin(struct key *keyring, struct key *key, struct assoc_array_edit **_edit) { struct assoc_array_edit *edit; BUG_ON(*_edit != NULL); edit = assoc_array_delete(&keyring->keys, &keyring_assoc_array_ops, &key->index_key); if (IS_ERR(edit)) return PTR_ERR(edit); if (!edit) return -ENOENT; *_edit = edit; return 0; } /* * Apply an unlink change. */ static void __key_unlink(struct key *keyring, struct key *key, struct assoc_array_edit **_edit) { assoc_array_apply_edit(*_edit); notify_key(keyring, NOTIFY_KEY_UNLINKED, key_serial(key)); *_edit = NULL; key_payload_reserve(keyring, keyring->datalen - KEYQUOTA_LINK_BYTES); } /* * Finish unlinking a key from to a keyring. */ static void __key_unlink_end(struct key *keyring, struct key *key, struct assoc_array_edit *edit) __releases(&keyring->sem) { if (edit) assoc_array_cancel_edit(edit); up_write(&keyring->sem); } /** * key_unlink - Unlink the first link to a key from a keyring. * @keyring: The keyring to remove the link from. * @key: The key the link is to. * * Remove a link from a keyring to a key. * * This function will write-lock the keyring's semaphore. * * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring, -ENOENT if * the key isn't linked to by the keyring or -ENOMEM if there's insufficient * memory. * * It is assumed that the caller has checked that it is permitted for a link to * be removed (the keyring should have Write permission; no permissions are * required on the key). */ int key_unlink(struct key *keyring, struct key *key) { struct assoc_array_edit *edit = NULL; int ret; key_check(keyring); key_check(key); ret = __key_unlink_lock(keyring); if (ret < 0) return ret; ret = __key_unlink_begin(keyring, key, &edit); if (ret == 0) __key_unlink(keyring, key, &edit); __key_unlink_end(keyring, key, edit); return ret; } EXPORT_SYMBOL(key_unlink); /** * key_move - Move a key from one keyring to another * @key: The key to move * @from_keyring: The keyring to remove the link from. * @to_keyring: The keyring to make the link in. * @flags: Qualifying flags, such as KEYCTL_MOVE_EXCL. * * Make a link in @to_keyring to a key, such that the keyring holds a reference * on that key and the key can potentially be found by searching that keyring * whilst simultaneously removing a link to the key from @from_keyring. * * This function will write-lock both keyring's semaphores and will consume * some of the user's key data quota to hold the link on @to_keyring. * * Returns 0 if successful, -ENOTDIR if either keyring isn't a keyring, * -EKEYREVOKED if either keyring has been revoked, -ENFILE if the second * keyring is full, -EDQUOT if there is insufficient key data quota remaining * to add another link or -ENOMEM if there's insufficient memory. If * KEYCTL_MOVE_EXCL is set, then -EEXIST will be returned if there's already a * matching key in @to_keyring. * * It is assumed that the caller has checked that it is permitted for a link to * be made (the keyring should have Write permission and the key Link * permission). */ int key_move(struct key *key, struct key *from_keyring, struct key *to_keyring, unsigned int flags) { struct assoc_array_edit *from_edit = NULL, *to_edit = NULL; int ret; kenter("%d,%d,%d", key->serial, from_keyring->serial, to_keyring->serial); if (from_keyring == to_keyring) return 0; key_check(key); key_check(from_keyring); key_check(to_keyring); ret = __key_move_lock(from_keyring, to_keyring, &key->index_key); if (ret < 0) goto out; ret = __key_unlink_begin(from_keyring, key, &from_edit); if (ret < 0) goto error; ret = __key_link_begin(to_keyring, &key->index_key, &to_edit); if (ret < 0) goto error; ret = -EEXIST; if (to_edit->dead_leaf && (flags & KEYCTL_MOVE_EXCL)) goto error; ret = __key_link_check_restriction(to_keyring, key); if (ret < 0) goto error; ret = __key_link_check_live_key(to_keyring, key); if (ret < 0) goto error; __key_unlink(from_keyring, key, &from_edit); __key_link(to_keyring, key, &to_edit); error: __key_link_end(to_keyring, &key->index_key, to_edit); __key_unlink_end(from_keyring, key, from_edit); out: kleave(" = %d", ret); return ret; } EXPORT_SYMBOL(key_move); /** * keyring_clear - Clear a keyring * @keyring: The keyring to clear. * * Clear the contents of the specified keyring. * * Returns 0 if successful or -ENOTDIR if the keyring isn't a keyring. */ int keyring_clear(struct key *keyring) { struct assoc_array_edit *edit; int ret; if (keyring->type != &key_type_keyring) return -ENOTDIR; down_write(&keyring->sem); edit = assoc_array_clear(&keyring->keys, &keyring_assoc_array_ops); if (IS_ERR(edit)) { ret = PTR_ERR(edit); } else { if (edit) assoc_array_apply_edit(edit); notify_key(keyring, NOTIFY_KEY_CLEARED, 0); key_payload_reserve(keyring, 0); ret = 0; } up_write(&keyring->sem); return ret; } EXPORT_SYMBOL(keyring_clear); /* * Dispose of the links from a revoked keyring. * * This is called with the key sem write-locked. */ static void keyring_revoke(struct key *keyring) { struct assoc_array_edit *edit; edit = assoc_array_clear(&keyring->keys, &keyring_assoc_array_ops); if (!IS_ERR(edit)) { if (edit) assoc_array_apply_edit(edit); key_payload_reserve(keyring, 0); } } static bool keyring_gc_select_iterator(void *object, void *iterator_data) { struct key *key = keyring_ptr_to_key(object); time64_t *limit = iterator_data; if (key_is_dead(key, *limit)) return false; key_get(key); return true; } static int keyring_gc_check_iterator(const void *object, void *iterator_data) { const struct key *key = keyring_ptr_to_key(object); time64_t *limit = iterator_data; key_check(key); return key_is_dead(key, *limit); } /* * Garbage collect pointers from a keyring. * * Not called with any locks held. The keyring's key struct will not be * deallocated under us as only our caller may deallocate it. */ void keyring_gc(struct key *keyring, time64_t limit) { int result; kenter("%x{%s}", keyring->serial, keyring->description ?: ""); if (keyring->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) goto dont_gc; /* scan the keyring looking for dead keys */ rcu_read_lock(); result = assoc_array_iterate(&keyring->keys, keyring_gc_check_iterator, &limit); rcu_read_unlock(); if (result == true) goto do_gc; dont_gc: kleave(" [no gc]"); return; do_gc: down_write(&keyring->sem); assoc_array_gc(&keyring->keys, &keyring_assoc_array_ops, keyring_gc_select_iterator, &limit); up_write(&keyring->sem); kleave(" [gc]"); } /* * Garbage collect restriction pointers from a keyring. * * Keyring restrictions are associated with a key type, and must be cleaned * up if the key type is unregistered. The restriction is altered to always * reject additional keys so a keyring cannot be opened up by unregistering * a key type. * * Not called with any keyring locks held. The keyring's key struct will not * be deallocated under us as only our caller may deallocate it. * * The caller is required to hold key_types_sem and dead_type->sem. This is * fulfilled by key_gc_keytype() holding the locks on behalf of * key_garbage_collector(), which it invokes on a workqueue. */ void keyring_restriction_gc(struct key *keyring, struct key_type *dead_type) { struct key_restriction *keyres; kenter("%x{%s}", keyring->serial, keyring->description ?: ""); /* * keyring->restrict_link is only assigned at key allocation time * or with the key type locked, so the only values that could be * concurrently assigned to keyring->restrict_link are for key * types other than dead_type. Given this, it's ok to check * the key type before acquiring keyring->sem. */ if (!dead_type || !keyring->restrict_link || keyring->restrict_link->keytype != dead_type) { kleave(" [no restriction gc]"); return; } /* Lock the keyring to ensure that a link is not in progress */ down_write(&keyring->sem); keyres = keyring->restrict_link; keyres->check = restrict_link_reject; key_put(keyres->key); keyres->key = NULL; keyres->keytype = NULL; up_write(&keyring->sem); kleave(" [restriction gc]"); }
10 10 11 2 13 13 13 13 13 14 1 1 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. */ #include <rdma/rdma_netlink.h> #include <net/addrconf.h> #include "rxe.h" #include "rxe_loc.h" MODULE_AUTHOR("Bob Pearson, Frank Zago, John Groves, Kamal Heib"); MODULE_DESCRIPTION("Soft RDMA transport"); MODULE_LICENSE("Dual BSD/GPL"); /* free resources for a rxe device all objects created for this device must * have been destroyed */ void rxe_dealloc(struct ib_device *ib_dev) { struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev); rxe_pool_cleanup(&rxe->uc_pool); rxe_pool_cleanup(&rxe->pd_pool); rxe_pool_cleanup(&rxe->ah_pool); rxe_pool_cleanup(&rxe->srq_pool); rxe_pool_cleanup(&rxe->qp_pool); rxe_pool_cleanup(&rxe->cq_pool); rxe_pool_cleanup(&rxe->mr_pool); rxe_pool_cleanup(&rxe->mw_pool); WARN_ON(!RB_EMPTY_ROOT(&rxe->mcg_tree)); mutex_destroy(&rxe->usdev_lock); } static const struct ib_device_ops rxe_ib_dev_odp_ops = { .advise_mr = rxe_ib_advise_mr, }; /* initialize rxe device parameters */ static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev) { rxe->max_inline_data = RXE_MAX_INLINE_DATA; rxe->attr.vendor_id = RXE_VENDOR_ID; rxe->attr.max_mr_size = RXE_MAX_MR_SIZE; rxe->attr.page_size_cap = RXE_PAGE_SIZE_CAP; rxe->attr.max_qp = RXE_MAX_QP; rxe->attr.max_qp_wr = RXE_MAX_QP_WR; rxe->attr.device_cap_flags = RXE_DEVICE_CAP_FLAGS; rxe->attr.kernel_cap_flags = IBK_ALLOW_USER_UNREG; rxe->attr.max_send_sge = RXE_MAX_SGE; rxe->attr.max_recv_sge = RXE_MAX_SGE; rxe->attr.max_sge_rd = RXE_MAX_SGE_RD; rxe->attr.max_cq = RXE_MAX_CQ; rxe->attr.max_cqe = (1 << RXE_MAX_LOG_CQE) - 1; rxe->attr.max_mr = RXE_MAX_MR; rxe->attr.max_mw = RXE_MAX_MW; rxe->attr.max_pd = RXE_MAX_PD; rxe->attr.max_qp_rd_atom = RXE_MAX_QP_RD_ATOM; rxe->attr.max_res_rd_atom = RXE_MAX_RES_RD_ATOM; rxe->attr.max_qp_init_rd_atom = RXE_MAX_QP_INIT_RD_ATOM; rxe->attr.atomic_cap = IB_ATOMIC_HCA; rxe->attr.max_mcast_grp = RXE_MAX_MCAST_GRP; rxe->attr.max_mcast_qp_attach = RXE_MAX_MCAST_QP_ATTACH; rxe->attr.max_total_mcast_qp_attach = RXE_MAX_TOT_MCAST_QP_ATTACH; rxe->attr.max_ah = RXE_MAX_AH; rxe->attr.max_srq = RXE_MAX_SRQ; rxe->attr.max_srq_wr = RXE_MAX_SRQ_WR; rxe->attr.max_srq_sge = RXE_MAX_SRQ_SGE; rxe->attr.max_fast_reg_page_list_len = RXE_MAX_FMR_PAGE_LIST_LEN; rxe->attr.max_pkeys = RXE_MAX_PKEYS; rxe->attr.local_ca_ack_delay = RXE_LOCAL_CA_ACK_DELAY; if (ndev->addr_len) { memcpy(rxe->raw_gid, ndev->dev_addr, min_t(unsigned int, ndev->addr_len, ETH_ALEN)); } else { /* * This device does not have a HW address, but * connection mangagement requires a unique gid. */ eth_random_addr(rxe->raw_gid); } addrconf_addr_eui48((unsigned char *)&rxe->attr.sys_image_guid, rxe->raw_gid); rxe->max_ucontext = RXE_MAX_UCONTEXT; if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { rxe->attr.kernel_cap_flags |= IBK_ON_DEMAND_PAGING; /* IB_ODP_SUPPORT_IMPLICIT is not supported right now. */ rxe->attr.odp_caps.general_caps |= IB_ODP_SUPPORT; rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND; rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_RECV; rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_FLUSH; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC_WRITE; /* set handler for ODP prefetching API - ibv_advise_mr(3) */ ib_set_device_ops(&rxe->ib_dev, &rxe_ib_dev_odp_ops); } } /* initialize port attributes */ static void rxe_init_port_param(struct rxe_port *port) { port->attr.state = IB_PORT_DOWN; port->attr.max_mtu = IB_MTU_4096; port->attr.active_mtu = IB_MTU_256; port->attr.gid_tbl_len = RXE_PORT_GID_TBL_LEN; port->attr.port_cap_flags = RXE_PORT_PORT_CAP_FLAGS; port->attr.max_msg_sz = RXE_PORT_MAX_MSG_SZ; port->attr.bad_pkey_cntr = RXE_PORT_BAD_PKEY_CNTR; port->attr.qkey_viol_cntr = RXE_PORT_QKEY_VIOL_CNTR; port->attr.pkey_tbl_len = RXE_PORT_PKEY_TBL_LEN; port->attr.lid = RXE_PORT_LID; port->attr.sm_lid = RXE_PORT_SM_LID; port->attr.lmc = RXE_PORT_LMC; port->attr.max_vl_num = RXE_PORT_MAX_VL_NUM; port->attr.sm_sl = RXE_PORT_SM_SL; port->attr.subnet_timeout = RXE_PORT_SUBNET_TIMEOUT; port->attr.init_type_reply = RXE_PORT_INIT_TYPE_REPLY; port->attr.active_width = RXE_PORT_ACTIVE_WIDTH; port->attr.active_speed = RXE_PORT_ACTIVE_SPEED; port->attr.phys_state = RXE_PORT_PHYS_STATE; port->mtu_cap = ib_mtu_enum_to_int(IB_MTU_256); port->subnet_prefix = cpu_to_be64(RXE_PORT_SUBNET_PREFIX); } /* initialize port state, note IB convention that HCA ports are always * numbered from 1 */ static void rxe_init_ports(struct rxe_dev *rxe, struct net_device *ndev) { struct rxe_port *port = &rxe->port; rxe_init_port_param(port); addrconf_addr_eui48((unsigned char *)&port->port_guid, rxe->raw_gid); spin_lock_init(&port->port_lock); } /* init pools of managed objects */ static void rxe_init_pools(struct rxe_dev *rxe) { rxe_pool_init(rxe, &rxe->uc_pool, RXE_TYPE_UC); rxe_pool_init(rxe, &rxe->pd_pool, RXE_TYPE_PD); rxe_pool_init(rxe, &rxe->ah_pool, RXE_TYPE_AH); rxe_pool_init(rxe, &rxe->srq_pool, RXE_TYPE_SRQ); rxe_pool_init(rxe, &rxe->qp_pool, RXE_TYPE_QP); rxe_pool_init(rxe, &rxe->cq_pool, RXE_TYPE_CQ); rxe_pool_init(rxe, &rxe->mr_pool, RXE_TYPE_MR); rxe_pool_init(rxe, &rxe->mw_pool, RXE_TYPE_MW); } /* initialize rxe device state */ static void rxe_init(struct rxe_dev *rxe, struct net_device *ndev) { /* init default device parameters */ rxe_init_device_param(rxe, ndev); rxe_init_ports(rxe, ndev); rxe_init_pools(rxe); /* init pending mmap list */ spin_lock_init(&rxe->mmap_offset_lock); spin_lock_init(&rxe->pending_lock); INIT_LIST_HEAD(&rxe->pending_mmaps); /* init multicast support */ spin_lock_init(&rxe->mcg_lock); rxe->mcg_tree = RB_ROOT; mutex_init(&rxe->usdev_lock); } void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) { struct rxe_port *port = &rxe->port; enum ib_mtu mtu; mtu = eth_mtu_int_to_enum(ndev_mtu); /* Make sure that new MTU in range */ mtu = mtu ? min_t(enum ib_mtu, mtu, IB_MTU_4096) : IB_MTU_256; port->attr.active_mtu = mtu; port->mtu_cap = ib_mtu_enum_to_int(mtu); } /* called by ifc layer to create new rxe device. * The caller should allocate memory for rxe by calling ib_alloc_device. */ int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name, struct net_device *ndev) { rxe_init(rxe, ndev); rxe_set_mtu(rxe, mtu); return rxe_register_device(rxe, ibdev_name, ndev); } static int rxe_newlink(const char *ibdev_name, struct net_device *ndev) { struct rxe_dev *rxe; int err = 0; if (is_vlan_dev(ndev)) { rxe_err("rxe creation allowed on top of a real device only\n"); err = -EPERM; goto err; } rxe = rxe_get_dev_from_net(ndev); if (rxe) { ib_device_put(&rxe->ib_dev); rxe_err_dev(rxe, "already configured on %s\n", ndev->name); err = -EEXIST; goto err; } err = rxe_net_add(ibdev_name, ndev); if (err) { rxe_err("failed to add %s\n", ndev->name); goto err; } err: return err; } static struct rdma_link_ops rxe_link_ops = { .type = "rxe", .newlink = rxe_newlink, }; static int __init rxe_module_init(void) { int err; err = rxe_alloc_wq(); if (err) return err; err = rxe_net_init(); if (err) { rxe_destroy_wq(); return err; } rdma_link_register(&rxe_link_ops); pr_info("loaded\n"); return 0; } static void __exit rxe_module_exit(void) { rdma_link_unregister(&rxe_link_ops); ib_unregister_driver(RDMA_DRIVER_RXE); rxe_net_exit(); rxe_destroy_wq(); pr_info("unloaded\n"); } late_initcall(rxe_module_init); module_exit(rxe_module_exit); MODULE_ALIAS_RDMA_LINK("rxe");
8 8 8 8 8 8 17 1 16 10 6 1 6 1 6 8 7 1 8 8 8 1 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 // SPDX-License-Identifier: GPL-2.0-or-later /* RxRPC recvmsg() implementation * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/net.h> #include <linux/skbuff.h> #include <linux/export.h> #include <linux/sched/signal.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include "ar-internal.h" /* * Post a call for attention by the socket or kernel service. Further * notifications are suppressed by putting recvmsg_link on a dummy queue. */ void rxrpc_notify_socket(struct rxrpc_call *call) { struct rxrpc_sock *rx; struct sock *sk; _enter("%d", call->debug_id); if (!list_empty(&call->recvmsg_link)) return; if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) { rxrpc_see_call(call, rxrpc_call_see_notify_released); return; } rcu_read_lock(); rx = rcu_dereference(call->socket); sk = &rx->sk; if (rx && sk->sk_state < RXRPC_CLOSE) { if (call->notify_rx) { spin_lock_irq(&call->notify_lock); call->notify_rx(sk, call, call->user_call_ID); spin_unlock_irq(&call->notify_lock); } else { spin_lock_irq(&rx->recvmsg_lock); if (list_empty(&call->recvmsg_link)) { rxrpc_get_call(call, rxrpc_call_get_notify_socket); list_add_tail(&call->recvmsg_link, &rx->recvmsg_q); } spin_unlock_irq(&rx->recvmsg_lock); if (!sock_flag(sk, SOCK_DEAD)) { _debug("call %ps", sk->sk_data_ready); sk->sk_data_ready(sk); } } } rcu_read_unlock(); _leave(""); } /* * Pass a call terminating message to userspace. */ static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg) { u32 tmp = 0; int ret; switch (call->completion) { case RXRPC_CALL_SUCCEEDED: ret = 0; if (rxrpc_is_service_call(call)) ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ACK, 0, &tmp); break; case RXRPC_CALL_REMOTELY_ABORTED: tmp = call->abort_code; ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &tmp); break; case RXRPC_CALL_LOCALLY_ABORTED: tmp = call->abort_code; ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &tmp); break; case RXRPC_CALL_NETWORK_ERROR: tmp = -call->error; ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NET_ERROR, 4, &tmp); break; case RXRPC_CALL_LOCAL_ERROR: tmp = -call->error; ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4, &tmp); break; default: pr_err("Invalid terminal call state %u\n", call->completion); BUG(); break; } trace_rxrpc_recvdata(call, rxrpc_recvmsg_terminal, call->ackr_window - 1, call->rx_pkt_offset, call->rx_pkt_len, ret); return ret; } /* * Discard a packet we've used up and advance the Rx window by one. */ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) { struct rxrpc_skb_priv *sp; struct sk_buff *skb; rxrpc_serial_t serial; rxrpc_seq_t old_consumed = call->rx_consumed, tseq; bool last; int acked; _enter("%d", call->debug_id); skb = skb_dequeue(&call->recvmsg_queue); rxrpc_see_skb(skb, rxrpc_skb_see_rotate); sp = rxrpc_skb(skb); tseq = sp->hdr.seq; serial = sp->hdr.serial; last = sp->hdr.flags & RXRPC_LAST_PACKET; /* Barrier against rxrpc_input_data(). */ if (after(tseq, call->rx_consumed)) smp_store_release(&call->rx_consumed, tseq); rxrpc_free_skb(skb, rxrpc_skb_put_rotate); trace_rxrpc_receive(call, last ? rxrpc_receive_rotate_last : rxrpc_receive_rotate, serial, call->rx_consumed); if (last) set_bit(RXRPC_CALL_RECVMSG_READ_ALL, &call->flags); /* Check to see if there's an ACK that needs sending. */ acked = atomic_add_return(call->rx_consumed - old_consumed, &call->ackr_nr_consumed); if (acked > 8 && !test_and_set_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags)) rxrpc_poke_call(call, rxrpc_call_poke_idle); } /* * Decrypt and verify a DATA packet. */ static int rxrpc_verify_data(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); if (sp->flags & RXRPC_RX_VERIFIED) return 0; return call->security->verify_packet(call, skb); } /* * Transcribe a call's user ID to a control message. */ static int rxrpc_recvmsg_user_id(struct rxrpc_call *call, struct msghdr *msg, int flags) { if (!test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) return 0; if (flags & MSG_CMSG_COMPAT) { unsigned int id32 = call->user_call_ID; return put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, sizeof(unsigned int), &id32); } else { unsigned long idl = call->user_call_ID; return put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, sizeof(unsigned long), &idl); } } /* * Deal with a CHALLENGE packet. */ static int rxrpc_recvmsg_challenge(struct socket *sock, struct msghdr *msg, struct sk_buff *challenge, unsigned int flags) { struct rxrpc_skb_priv *sp = rxrpc_skb(challenge); struct rxrpc_connection *conn = sp->chall.conn; return conn->security->challenge_to_recvmsg(conn, challenge, msg); } /* * Process OOB packets. Called with the socket locked. */ static int rxrpc_recvmsg_oob(struct socket *sock, struct msghdr *msg, unsigned int flags) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); struct sk_buff *skb; bool need_response = false; int ret; skb = skb_peek(&rx->recvmsg_oobq); if (!skb) return -EAGAIN; rxrpc_see_skb(skb, rxrpc_skb_see_recvmsg); ret = put_cmsg(msg, SOL_RXRPC, RXRPC_OOB_ID, sizeof(u64), &skb->skb_mstamp_ns); if (ret < 0) return ret; switch ((enum rxrpc_oob_type)skb->mark) { case RXRPC_OOB_CHALLENGE: need_response = true; ret = rxrpc_recvmsg_challenge(sock, msg, skb, flags); break; default: WARN_ONCE(1, "recvmsg() can't process unknown OOB type %u\n", skb->mark); ret = -EIO; break; } if (!(flags & MSG_PEEK)) skb_unlink(skb, &rx->recvmsg_oobq); if (need_response) rxrpc_add_pending_oob(rx, skb); else rxrpc_free_skb(skb, rxrpc_skb_put_oob); return ret; } /* * Deliver messages to a call. This keeps processing packets until the buffer * is filled and we find either more DATA (returns 0) or the end of the DATA * (returns 1). If more packets are required, it returns -EAGAIN and if the * call has failed it returns -EIO. */ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, struct msghdr *msg, struct iov_iter *iter, size_t len, int flags, size_t *_offset) { struct rxrpc_skb_priv *sp; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); struct sk_buff *skb; rxrpc_seq_t seq = 0; size_t remain; unsigned int rx_pkt_offset, rx_pkt_len; int copy, ret = -EAGAIN, ret2; rx_pkt_offset = call->rx_pkt_offset; rx_pkt_len = call->rx_pkt_len; if (rxrpc_call_has_failed(call)) { seq = call->ackr_window - 1; ret = -EIO; goto done; } if (test_bit(RXRPC_CALL_RECVMSG_READ_ALL, &call->flags)) { seq = call->ackr_window - 1; ret = 1; goto done; } /* No one else can be removing stuff from the queue, so we shouldn't * need the Rx lock to walk it. */ skb = skb_peek(&call->recvmsg_queue); while (skb) { rxrpc_see_skb(skb, rxrpc_skb_see_recvmsg); sp = rxrpc_skb(skb); seq = sp->hdr.seq; if (!(flags & MSG_PEEK)) trace_rxrpc_receive(call, rxrpc_receive_front, sp->hdr.serial, seq); if (msg) sock_recv_timestamp(msg, sock->sk, skb); if (rx_pkt_offset == 0) { ret2 = rxrpc_verify_data(call, skb); trace_rxrpc_recvdata(call, rxrpc_recvmsg_next, seq, sp->offset, sp->len, ret2); if (ret2 < 0) { ret = ret2; goto out; } rx_pkt_offset = sp->offset; rx_pkt_len = sp->len; } else { trace_rxrpc_recvdata(call, rxrpc_recvmsg_cont, seq, rx_pkt_offset, rx_pkt_len, 0); } /* We have to handle short, empty and used-up DATA packets. */ remain = len - *_offset; copy = rx_pkt_len; if (copy > remain) copy = remain; if (copy > 0) { ret2 = skb_copy_datagram_iter(skb, rx_pkt_offset, iter, copy); if (ret2 < 0) { ret = ret2; goto out; } /* handle piecemeal consumption of data packets */ rx_pkt_offset += copy; rx_pkt_len -= copy; *_offset += copy; } if (rx_pkt_len > 0) { trace_rxrpc_recvdata(call, rxrpc_recvmsg_full, seq, rx_pkt_offset, rx_pkt_len, 0); ASSERTCMP(*_offset, ==, len); ret = 0; break; } /* The whole packet has been transferred. */ if (sp->hdr.flags & RXRPC_LAST_PACKET) ret = 1; rx_pkt_offset = 0; rx_pkt_len = 0; skb = skb_peek_next(skb, &call->recvmsg_queue); if (!(flags & MSG_PEEK)) rxrpc_rotate_rx_window(call); if (!rx->app_ops && !skb_queue_empty_lockless(&rx->recvmsg_oobq)) { trace_rxrpc_recvdata(call, rxrpc_recvmsg_oobq, seq, rx_pkt_offset, rx_pkt_len, ret); break; } } out: if (!(flags & MSG_PEEK)) { call->rx_pkt_offset = rx_pkt_offset; call->rx_pkt_len = rx_pkt_len; } done: trace_rxrpc_recvdata(call, rxrpc_recvmsg_data_return, seq, rx_pkt_offset, rx_pkt_len, ret); if (ret == -EAGAIN) set_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags); return ret; } /* * Receive a message from an RxRPC socket * - we need to be careful about two or more threads calling recvmsg * simultaneously */ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct rxrpc_call *call; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); struct list_head *l; unsigned int call_debug_id = 0; size_t copied = 0; long timeo; int ret; DEFINE_WAIT(wait); trace_rxrpc_recvmsg(0, rxrpc_recvmsg_enter, 0); if (flags & (MSG_OOB | MSG_TRUNC)) return -EOPNOTSUPP; timeo = sock_rcvtimeo(&rx->sk, flags & MSG_DONTWAIT); try_again: lock_sock(&rx->sk); /* Return immediately if a client socket has no outstanding calls */ if (RB_EMPTY_ROOT(&rx->calls) && list_empty(&rx->recvmsg_q) && skb_queue_empty_lockless(&rx->recvmsg_oobq) && rx->sk.sk_state != RXRPC_SERVER_LISTENING) { release_sock(&rx->sk); return -EAGAIN; } if (list_empty(&rx->recvmsg_q)) { ret = -EWOULDBLOCK; if (timeo == 0) { call = NULL; goto error_no_call; } release_sock(&rx->sk); /* Wait for something to happen */ prepare_to_wait_exclusive(sk_sleep(&rx->sk), &wait, TASK_INTERRUPTIBLE); ret = sock_error(&rx->sk); if (ret) goto wait_error; if (list_empty(&rx->recvmsg_q) && skb_queue_empty_lockless(&rx->recvmsg_oobq)) { if (signal_pending(current)) goto wait_interrupted; trace_rxrpc_recvmsg(0, rxrpc_recvmsg_wait, 0); timeo = schedule_timeout(timeo); } finish_wait(sk_sleep(&rx->sk), &wait); goto try_again; } /* Deal with OOB messages before we consider getting normal data. */ if (!skb_queue_empty_lockless(&rx->recvmsg_oobq)) { ret = rxrpc_recvmsg_oob(sock, msg, flags); release_sock(&rx->sk); if (ret == -EAGAIN) goto try_again; goto error_no_call; } /* Find the next call and dequeue it if we're not just peeking. If we * do dequeue it, that comes with a ref that we will need to release. * We also want to weed out calls that got requeued whilst we were * shovelling data out. */ spin_lock_irq(&rx->recvmsg_lock); l = rx->recvmsg_q.next; call = list_entry(l, struct rxrpc_call, recvmsg_link); if (!rxrpc_call_is_complete(call) && skb_queue_empty(&call->recvmsg_queue) && skb_queue_empty(&rx->recvmsg_oobq)) { list_del_init(&call->recvmsg_link); spin_unlock_irq(&rx->recvmsg_lock); release_sock(&rx->sk); trace_rxrpc_recvmsg(call->debug_id, rxrpc_recvmsg_unqueue, 0); rxrpc_put_call(call, rxrpc_call_put_recvmsg); goto try_again; } rxrpc_see_call(call, rxrpc_call_see_recvmsg); if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) { rxrpc_see_call(call, rxrpc_call_see_already_released); list_del_init(&call->recvmsg_link); spin_unlock_irq(&rx->recvmsg_lock); release_sock(&rx->sk); trace_rxrpc_recvmsg(call->debug_id, rxrpc_recvmsg_unqueue, 0); rxrpc_put_call(call, rxrpc_call_put_recvmsg); goto try_again; } if (!(flags & MSG_PEEK)) list_del_init(&call->recvmsg_link); else rxrpc_get_call(call, rxrpc_call_get_recvmsg); spin_unlock_irq(&rx->recvmsg_lock); call_debug_id = call->debug_id; trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_dequeue, 0); /* We're going to drop the socket lock, so we need to lock the call * against interference by sendmsg. */ if (!mutex_trylock(&call->user_mutex)) { ret = -EWOULDBLOCK; if (flags & MSG_DONTWAIT) goto error_requeue_call; ret = -ERESTARTSYS; if (mutex_lock_interruptible(&call->user_mutex) < 0) goto error_requeue_call; } release_sock(&rx->sk); if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) { rxrpc_see_call(call, rxrpc_call_see_already_released); mutex_unlock(&call->user_mutex); if (!(flags & MSG_PEEK)) rxrpc_put_call(call, rxrpc_call_put_recvmsg); goto try_again; } ret = rxrpc_recvmsg_user_id(call, msg, flags); if (ret < 0) goto error_unlock_call; if (msg->msg_name && call->peer) { size_t len = sizeof(call->dest_srx); memcpy(msg->msg_name, &call->dest_srx, len); msg->msg_namelen = len; } ret = rxrpc_recvmsg_data(sock, call, msg, &msg->msg_iter, len, flags, &copied); if (ret == -EAGAIN) ret = 0; if (ret == -EIO) goto call_failed; if (ret < 0) goto error_unlock_call; if (rxrpc_call_is_complete(call) && skb_queue_empty(&call->recvmsg_queue)) goto call_complete; if (rxrpc_call_has_failed(call)) goto call_failed; if (!skb_queue_empty(&call->recvmsg_queue)) rxrpc_notify_socket(call); goto not_yet_complete; call_failed: rxrpc_purge_queue(&call->recvmsg_queue); call_complete: ret = rxrpc_recvmsg_term(call, msg); if (ret < 0) goto error_unlock_call; if (!(flags & MSG_PEEK)) rxrpc_release_call(rx, call); msg->msg_flags |= MSG_EOR; ret = 1; not_yet_complete: if (ret == 0) msg->msg_flags |= MSG_MORE; else msg->msg_flags &= ~MSG_MORE; ret = copied; error_unlock_call: mutex_unlock(&call->user_mutex); rxrpc_put_call(call, rxrpc_call_put_recvmsg); trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_return, ret); return ret; error_requeue_call: if (!(flags & MSG_PEEK)) { spin_lock_irq(&rx->recvmsg_lock); list_add(&call->recvmsg_link, &rx->recvmsg_q); spin_unlock_irq(&rx->recvmsg_lock); trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_requeue, 0); } else { rxrpc_put_call(call, rxrpc_call_put_recvmsg); } error_no_call: release_sock(&rx->sk); error_trace: trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_return, ret); return ret; wait_interrupted: ret = sock_intr_errno(timeo); wait_error: finish_wait(sk_sleep(&rx->sk), &wait); call = NULL; goto error_trace; } /** * rxrpc_kernel_recv_data - Allow a kernel service to receive data/info * @sock: The socket that the call exists on * @call: The call to send data through * @iter: The buffer to receive into * @_len: The amount of data we want to receive (decreased on return) * @want_more: True if more data is expected to be read * @_abort: Where the abort code is stored if -ECONNABORTED is returned * @_service: Where to store the actual service ID (may be upgraded) * * Allow a kernel service to receive data and pick up information about the * state of a call. Note that *@_abort should also be initialised to %0. * * Note that we may return %-EAGAIN to drain empty packets at the end * of the data, even if we've already copied over the requested data. * * Return: %0 if got what was asked for and there's more available, %1 * if we got what was asked for and we're at the end of the data and * %-EAGAIN if we need more data. */ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, struct iov_iter *iter, size_t *_len, bool want_more, u32 *_abort, u16 *_service) { size_t offset = 0; int ret; _enter("{%d},%zu,%d", call->debug_id, *_len, want_more); mutex_lock(&call->user_mutex); ret = rxrpc_recvmsg_data(sock, call, NULL, iter, *_len, 0, &offset); *_len -= offset; if (ret == -EIO) goto call_failed; if (ret < 0) goto out; /* We can only reach here with a partially full buffer if we have * reached the end of the data. We must otherwise have a full buffer * or have been given -EAGAIN. */ if (ret == 1) { if (iov_iter_count(iter) > 0) goto short_data; if (!want_more) goto read_phase_complete; ret = 0; goto out; } if (!want_more) goto excess_data; goto out; read_phase_complete: ret = 1; out: if (_service) *_service = call->dest_srx.srx_service; mutex_unlock(&call->user_mutex); _leave(" = %d [%zu,%d]", ret, iov_iter_count(iter), *_abort); return ret; short_data: trace_rxrpc_abort(call->debug_id, rxrpc_recvmsg_short_data, call->cid, call->call_id, call->rx_consumed, 0, -EBADMSG); ret = -EBADMSG; goto out; excess_data: trace_rxrpc_abort(call->debug_id, rxrpc_recvmsg_excess_data, call->cid, call->call_id, call->rx_consumed, 0, -EMSGSIZE); ret = -EMSGSIZE; goto out; call_failed: *_abort = call->abort_code; ret = call->error; if (call->completion == RXRPC_CALL_SUCCEEDED) { ret = 1; if (iov_iter_count(iter) > 0) ret = -ECONNRESET; } goto out; } EXPORT_SYMBOL(rxrpc_kernel_recv_data);
1 9 8 9 1 1 1 1 1 1 1 2 2 2 2 1 1 1 3 3 3 3 30 29 1 1 3 3 29 31 31 32 13 13 16 16 4 16 16 16 15 1 8 8 8 2 8 8 5 3 3 4 4 4 4 4 4 4 4 1 1 1 1 1 1 3 3 1 5 8 8 8 3 5 5 156 157 155 2 149 1 1 2 2 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 /* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2000-2001 Qualcomm Incorporated Copyright (C) 2011 ProFUSION Embedded Systems Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ /* Bluetooth HCI core. */ #include <linux/export.h> #include <linux/rfkill.h> #include <linux/debugfs.h> #include <linux/crypto.h> #include <linux/kcov.h> #include <linux/property.h> #include <linux/suspend.h> #include <linux/wait.h> #include <linux/unaligned.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/l2cap.h> #include <net/bluetooth/mgmt.h> #include "hci_debugfs.h" #include "smp.h" #include "leds.h" #include "msft.h" #include "aosp.h" #include "hci_codec.h" static void hci_rx_work(struct work_struct *work); static void hci_cmd_work(struct work_struct *work); static void hci_tx_work(struct work_struct *work); /* HCI device list */ LIST_HEAD(hci_dev_list); DEFINE_RWLOCK(hci_dev_list_lock); /* HCI callback list */ LIST_HEAD(hci_cb_list); DEFINE_MUTEX(hci_cb_list_lock); /* HCI ID Numbering */ static DEFINE_IDA(hci_index_ida); /* Get HCI device by index. * Device is held on return. */ static struct hci_dev *__hci_dev_get(int index, int *srcu_index) { struct hci_dev *hdev = NULL, *d; BT_DBG("%d", index); if (index < 0) return NULL; read_lock(&hci_dev_list_lock); list_for_each_entry(d, &hci_dev_list, list) { if (d->id == index) { hdev = hci_dev_hold(d); if (srcu_index) *srcu_index = srcu_read_lock(&d->srcu); break; } } read_unlock(&hci_dev_list_lock); return hdev; } struct hci_dev *hci_dev_get(int index) { return __hci_dev_get(index, NULL); } static struct hci_dev *hci_dev_get_srcu(int index, int *srcu_index) { return __hci_dev_get(index, srcu_index); } static void hci_dev_put_srcu(struct hci_dev *hdev, int srcu_index) { srcu_read_unlock(&hdev->srcu, srcu_index); hci_dev_put(hdev); } /* ---- Inquiry support ---- */ bool hci_discovery_active(struct hci_dev *hdev) { struct discovery_state *discov = &hdev->discovery; switch (discov->state) { case DISCOVERY_FINDING: case DISCOVERY_RESOLVING: return true; default: return false; } } void hci_discovery_set_state(struct hci_dev *hdev, int state) { int old_state = hdev->discovery.state; if (old_state == state) return; hdev->discovery.state = state; switch (state) { case DISCOVERY_STOPPED: hci_update_passive_scan(hdev); if (old_state != DISCOVERY_STARTING) mgmt_discovering(hdev, 0); break; case DISCOVERY_STARTING: break; case DISCOVERY_FINDING: mgmt_discovering(hdev, 1); break; case DISCOVERY_RESOLVING: break; case DISCOVERY_STOPPING: break; } bt_dev_dbg(hdev, "state %u -> %u", old_state, state); } void hci_inquiry_cache_flush(struct hci_dev *hdev) { struct discovery_state *cache = &hdev->discovery; struct inquiry_entry *p, *n; list_for_each_entry_safe(p, n, &cache->all, all) { list_del(&p->all); kfree(p); } INIT_LIST_HEAD(&cache->unknown); INIT_LIST_HEAD(&cache->resolve); } struct inquiry_entry *hci_inquiry_cache_lookup(struct hci_dev *hdev, bdaddr_t *bdaddr) { struct discovery_state *cache = &hdev->discovery; struct inquiry_entry *e; BT_DBG("cache %p, %pMR", cache, bdaddr); list_for_each_entry(e, &cache->all, all) { if (!bacmp(&e->data.bdaddr, bdaddr)) return e; } return NULL; } struct inquiry_entry *hci_inquiry_cache_lookup_unknown(struct hci_dev *hdev, bdaddr_t *bdaddr) { struct discovery_state *cache = &hdev->discovery; struct inquiry_entry *e; BT_DBG("cache %p, %pMR", cache, bdaddr); list_for_each_entry(e, &cache->unknown, list) { if (!bacmp(&e->data.bdaddr, bdaddr)) return e; } return NULL; } struct inquiry_entry *hci_inquiry_cache_lookup_resolve(struct hci_dev *hdev, bdaddr_t *bdaddr, int state) { struct discovery_state *cache = &hdev->discovery; struct inquiry_entry *e; BT_DBG("cache %p bdaddr %pMR state %d", cache, bdaddr, state); list_for_each_entry(e, &cache->resolve, list) { if (!bacmp(bdaddr, BDADDR_ANY) && e->name_state == state) return e; if (!bacmp(&e->data.bdaddr, bdaddr)) return e; } return NULL; } void hci_inquiry_cache_update_resolve(struct hci_dev *hdev, struct inquiry_entry *ie) { struct discovery_state *cache = &hdev->discovery; struct list_head *pos = &cache->resolve; struct inquiry_entry *p; list_del(&ie->list); list_for_each_entry(p, &cache->resolve, list) { if (p->name_state != NAME_PENDING && abs(p->data.rssi) >= abs(ie->data.rssi)) break; pos = &p->list; } list_add(&ie->list, pos); } u32 hci_inquiry_cache_update(struct hci_dev *hdev, struct inquiry_data *data, bool name_known) { struct discovery_state *cache = &hdev->discovery; struct inquiry_entry *ie; u32 flags = 0; BT_DBG("cache %p, %pMR", cache, &data->bdaddr); hci_remove_remote_oob_data(hdev, &data->bdaddr, BDADDR_BREDR); if (!data->ssp_mode) flags |= MGMT_DEV_FOUND_LEGACY_PAIRING; ie = hci_inquiry_cache_lookup(hdev, &data->bdaddr); if (ie) { if (!ie->data.ssp_mode) flags |= MGMT_DEV_FOUND_LEGACY_PAIRING; if (ie->name_state == NAME_NEEDED && data->rssi != ie->data.rssi) { ie->data.rssi = data->rssi; hci_inquiry_cache_update_resolve(hdev, ie); } goto update; } /* Entry not in the cache. Add new one. */ ie = kzalloc(sizeof(*ie), GFP_KERNEL); if (!ie) { flags |= MGMT_DEV_FOUND_CONFIRM_NAME; goto done; } list_add(&ie->all, &cache->all); if (name_known) { ie->name_state = NAME_KNOWN; } else { ie->name_state = NAME_NOT_KNOWN; list_add(&ie->list, &cache->unknown); } update: if (name_known && ie->name_state != NAME_KNOWN && ie->name_state != NAME_PENDING) { ie->name_state = NAME_KNOWN; list_del(&ie->list); } memcpy(&ie->data, data, sizeof(*data)); ie->timestamp = jiffies; cache->timestamp = jiffies; if (ie->name_state == NAME_NOT_KNOWN) flags |= MGMT_DEV_FOUND_CONFIRM_NAME; done: return flags; } static int inquiry_cache_dump(struct hci_dev *hdev, int num, __u8 *buf) { struct discovery_state *cache = &hdev->discovery; struct inquiry_info *info = (struct inquiry_info *) buf; struct inquiry_entry *e; int copied = 0; list_for_each_entry(e, &cache->all, all) { struct inquiry_data *data = &e->data; if (copied >= num) break; bacpy(&info->bdaddr, &data->bdaddr); info->pscan_rep_mode = data->pscan_rep_mode; info->pscan_period_mode = data->pscan_period_mode; info->pscan_mode = data->pscan_mode; memcpy(info->dev_class, data->dev_class, 3); info->clock_offset = data->clock_offset; info++; copied++; } BT_DBG("cache %p, copied %d", cache, copied); return copied; } int hci_inquiry(void __user *arg) { __u8 __user *ptr = arg; struct hci_inquiry_req ir; struct hci_dev *hdev; int err = 0, do_inquiry = 0, max_rsp; __u8 *buf; if (copy_from_user(&ir, ptr, sizeof(ir))) return -EFAULT; hdev = hci_dev_get(ir.dev_id); if (!hdev) return -ENODEV; if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EBUSY; goto done; } if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { err = -EOPNOTSUPP; goto done; } if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { err = -EOPNOTSUPP; goto done; } /* Restrict maximum inquiry length to 60 seconds */ if (ir.length > 60) { err = -EINVAL; goto done; } hci_dev_lock(hdev); if (inquiry_cache_age(hdev) > INQUIRY_CACHE_AGE_MAX || inquiry_cache_empty(hdev) || ir.flags & IREQ_CACHE_FLUSH) { hci_inquiry_cache_flush(hdev); do_inquiry = 1; } hci_dev_unlock(hdev); if (do_inquiry) { hci_req_sync_lock(hdev); err = hci_inquiry_sync(hdev, ir.length, ir.num_rsp); hci_req_sync_unlock(hdev); if (err < 0) goto done; /* Wait until Inquiry procedure finishes (HCI_INQUIRY flag is * cleared). If it is interrupted by a signal, return -EINTR. */ if (wait_on_bit(&hdev->flags, HCI_INQUIRY, TASK_INTERRUPTIBLE)) { err = -EINTR; goto done; } } /* for unlimited number of responses we will use buffer with * 255 entries */ max_rsp = (ir.num_rsp == 0) ? 255 : ir.num_rsp; /* cache_dump can't sleep. Therefore we allocate temp buffer and then * copy it to the user space. */ buf = kmalloc_array(max_rsp, sizeof(struct inquiry_info), GFP_KERNEL); if (!buf) { err = -ENOMEM; goto done; } hci_dev_lock(hdev); ir.num_rsp = inquiry_cache_dump(hdev, max_rsp, buf); hci_dev_unlock(hdev); BT_DBG("num_rsp %d", ir.num_rsp); if (!copy_to_user(ptr, &ir, sizeof(ir))) { ptr += sizeof(ir); if (copy_to_user(ptr, buf, sizeof(struct inquiry_info) * ir.num_rsp)) err = -EFAULT; } else err = -EFAULT; kfree(buf); done: hci_dev_put(hdev); return err; } static int hci_dev_do_open(struct hci_dev *hdev) { int ret = 0; BT_DBG("%s %p", hdev->name, hdev); hci_req_sync_lock(hdev); ret = hci_dev_open_sync(hdev); hci_req_sync_unlock(hdev); return ret; } /* ---- HCI ioctl helpers ---- */ int hci_dev_open(__u16 dev) { struct hci_dev *hdev; int err; hdev = hci_dev_get(dev); if (!hdev) return -ENODEV; /* Devices that are marked as unconfigured can only be powered * up as user channel. Trying to bring them up as normal devices * will result into a failure. Only user channel operation is * possible. * * When this function is called for a user channel, the flag * HCI_USER_CHANNEL will be set first before attempting to * open the device. */ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EOPNOTSUPP; goto done; } /* We need to ensure that no other power on/off work is pending * before proceeding to call hci_dev_do_open. This is * particularly important if the setup procedure has not yet * completed. */ if (hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) cancel_delayed_work(&hdev->power_off); /* After this call it is guaranteed that the setup procedure * has finished. This means that error conditions like RFKILL * or no valid public or static random address apply. */ flush_workqueue(hdev->req_workqueue); /* For controllers not using the management interface and that * are brought up using legacy ioctl, set the HCI_BONDABLE bit * so that pairing works for them. Once the management interface * is in use this bit will be cleared again and userspace has * to explicitly enable it. */ if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && !hci_dev_test_flag(hdev, HCI_MGMT)) hci_dev_set_flag(hdev, HCI_BONDABLE); err = hci_dev_do_open(hdev); done: hci_dev_put(hdev); return err; } int hci_dev_do_close(struct hci_dev *hdev) { int err; BT_DBG("%s %p", hdev->name, hdev); hci_req_sync_lock(hdev); err = hci_dev_close_sync(hdev); hci_req_sync_unlock(hdev); return err; } int hci_dev_close(__u16 dev) { struct hci_dev *hdev; int err; hdev = hci_dev_get(dev); if (!hdev) return -ENODEV; if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EBUSY; goto done; } cancel_work_sync(&hdev->power_on); if (hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) cancel_delayed_work(&hdev->power_off); err = hci_dev_do_close(hdev); done: hci_dev_put(hdev); return err; } static int hci_dev_do_reset(struct hci_dev *hdev) { int ret; BT_DBG("%s %p", hdev->name, hdev); hci_req_sync_lock(hdev); /* Drop queues */ skb_queue_purge(&hdev->rx_q); skb_queue_purge(&hdev->cmd_q); /* Cancel these to avoid queueing non-chained pending work */ hci_dev_set_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE); /* Wait for * * if (!hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE)) * queue_delayed_work(&hdev->{cmd,ncmd}_timer) * * inside RCU section to see the flag or complete scheduling. */ synchronize_rcu(); /* Explicitly cancel works in case scheduled after setting the flag. */ cancel_delayed_work(&hdev->cmd_timer); cancel_delayed_work(&hdev->ncmd_timer); /* Avoid potential lockdep warnings from the *_flush() calls by * ensuring the workqueue is empty up front. */ drain_workqueue(hdev->workqueue); hci_dev_lock(hdev); hci_inquiry_cache_flush(hdev); hci_conn_hash_flush(hdev); hci_dev_unlock(hdev); if (hdev->flush) hdev->flush(hdev); hci_dev_clear_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE); atomic_set(&hdev->cmd_cnt, 1); hdev->acl_cnt = 0; hdev->sco_cnt = 0; hdev->le_cnt = 0; hdev->iso_cnt = 0; ret = hci_reset_sync(hdev); hci_req_sync_unlock(hdev); return ret; } int hci_dev_reset(__u16 dev) { struct hci_dev *hdev; int err, srcu_index; hdev = hci_dev_get_srcu(dev, &srcu_index); if (!hdev) return -ENODEV; if (!test_bit(HCI_UP, &hdev->flags)) { err = -ENETDOWN; goto done; } if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EBUSY; goto done; } if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { err = -EOPNOTSUPP; goto done; } err = hci_dev_do_reset(hdev); done: hci_dev_put_srcu(hdev, srcu_index); return err; } int hci_dev_reset_stat(__u16 dev) { struct hci_dev *hdev; int ret = 0; hdev = hci_dev_get(dev); if (!hdev) return -ENODEV; if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { ret = -EBUSY; goto done; } if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { ret = -EOPNOTSUPP; goto done; } memset(&hdev->stat, 0, sizeof(struct hci_dev_stats)); done: hci_dev_put(hdev); return ret; } static void hci_update_passive_scan_state(struct hci_dev *hdev, u8 scan) { bool conn_changed, discov_changed; BT_DBG("%s scan 0x%02x", hdev->name, scan); if ((scan & SCAN_PAGE)) conn_changed = !hci_dev_test_and_set_flag(hdev, HCI_CONNECTABLE); else conn_changed = hci_dev_test_and_clear_flag(hdev, HCI_CONNECTABLE); if ((scan & SCAN_INQUIRY)) { discov_changed = !hci_dev_test_and_set_flag(hdev, HCI_DISCOVERABLE); } else { hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE); discov_changed = hci_dev_test_and_clear_flag(hdev, HCI_DISCOVERABLE); } if (!hci_dev_test_flag(hdev, HCI_MGMT)) return; if (conn_changed || discov_changed) { /* In case this was disabled through mgmt */ hci_dev_set_flag(hdev, HCI_BREDR_ENABLED); if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) hci_update_adv_data(hdev, hdev->cur_adv_instance); mgmt_new_settings(hdev); } } int hci_dev_cmd(unsigned int cmd, void __user *arg) { struct hci_dev *hdev; struct hci_dev_req dr; __le16 policy; int err = 0; if (copy_from_user(&dr, arg, sizeof(dr))) return -EFAULT; hdev = hci_dev_get(dr.dev_id); if (!hdev) return -ENODEV; if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EBUSY; goto done; } if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { err = -EOPNOTSUPP; goto done; } if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { err = -EOPNOTSUPP; goto done; } switch (cmd) { case HCISETAUTH: err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_AUTH_ENABLE, 1, &dr.dev_opt, HCI_CMD_TIMEOUT); break; case HCISETENCRYPT: if (!lmp_encrypt_capable(hdev)) { err = -EOPNOTSUPP; break; } if (!test_bit(HCI_AUTH, &hdev->flags)) { /* Auth must be enabled first */ err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_AUTH_ENABLE, 1, &dr.dev_opt, HCI_CMD_TIMEOUT); if (err) break; } err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_ENCRYPT_MODE, 1, &dr.dev_opt, HCI_CMD_TIMEOUT); break; case HCISETSCAN: err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_SCAN_ENABLE, 1, &dr.dev_opt, HCI_CMD_TIMEOUT); /* Ensure that the connectable and discoverable states * get correctly modified as this was a non-mgmt change. */ if (!err) hci_update_passive_scan_state(hdev, dr.dev_opt); break; case HCISETLINKPOL: policy = cpu_to_le16(dr.dev_opt); err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_DEF_LINK_POLICY, 2, &policy, HCI_CMD_TIMEOUT); break; case HCISETLINKMODE: hdev->link_mode = ((__u16) dr.dev_opt) & (HCI_LM_MASTER | HCI_LM_ACCEPT); break; case HCISETPTYPE: if (hdev->pkt_type == (__u16) dr.dev_opt) break; hdev->pkt_type = (__u16) dr.dev_opt; mgmt_phy_configuration_changed(hdev, NULL); break; case HCISETACLMTU: hdev->acl_mtu = *((__u16 *) &dr.dev_opt + 1); hdev->acl_pkts = *((__u16 *) &dr.dev_opt + 0); break; case HCISETSCOMTU: hdev->sco_mtu = *((__u16 *) &dr.dev_opt + 1); hdev->sco_pkts = *((__u16 *) &dr.dev_opt + 0); break; default: err = -EINVAL; break; } done: hci_dev_put(hdev); return err; } int hci_get_dev_list(void __user *arg) { struct hci_dev *hdev; struct hci_dev_list_req *dl; struct hci_dev_req *dr; int n = 0, err; __u16 dev_num; if (get_user(dev_num, (__u16 __user *) arg)) return -EFAULT; if (!dev_num || dev_num > (PAGE_SIZE * 2) / sizeof(*dr)) return -EINVAL; dl = kzalloc(struct_size(dl, dev_req, dev_num), GFP_KERNEL); if (!dl) return -ENOMEM; dl->dev_num = dev_num; dr = dl->dev_req; read_lock(&hci_dev_list_lock); list_for_each_entry(hdev, &hci_dev_list, list) { unsigned long flags = hdev->flags; /* When the auto-off is configured it means the transport * is running, but in that case still indicate that the * device is actually down. */ if (hci_dev_test_flag(hdev, HCI_AUTO_OFF)) flags &= ~BIT(HCI_UP); dr[n].dev_id = hdev->id; dr[n].dev_opt = flags; if (++n >= dev_num) break; } read_unlock(&hci_dev_list_lock); dl->dev_num = n; err = copy_to_user(arg, dl, struct_size(dl, dev_req, n)); kfree(dl); return err ? -EFAULT : 0; } int hci_get_dev_info(void __user *arg) { struct hci_dev *hdev; struct hci_dev_info di; unsigned long flags; int err = 0; if (copy_from_user(&di, arg, sizeof(di))) return -EFAULT; hdev = hci_dev_get(di.dev_id); if (!hdev) return -ENODEV; /* When the auto-off is configured it means the transport * is running, but in that case still indicate that the * device is actually down. */ if (hci_dev_test_flag(hdev, HCI_AUTO_OFF)) flags = hdev->flags & ~BIT(HCI_UP); else flags = hdev->flags; strscpy(di.name, hdev->name, sizeof(di.name)); di.bdaddr = hdev->bdaddr; di.type = (hdev->bus & 0x0f); di.flags = flags; di.pkt_type = hdev->pkt_type; if (lmp_bredr_capable(hdev)) { di.acl_mtu = hdev->acl_mtu; di.acl_pkts = hdev->acl_pkts; di.sco_mtu = hdev->sco_mtu; di.sco_pkts = hdev->sco_pkts; } else { di.acl_mtu = hdev->le_mtu; di.acl_pkts = hdev->le_pkts; di.sco_mtu = 0; di.sco_pkts = 0; } di.link_policy = hdev->link_policy; di.link_mode = hdev->link_mode; memcpy(&di.stat, &hdev->stat, sizeof(di.stat)); memcpy(&di.features, &hdev->features, sizeof(di.features)); if (copy_to_user(arg, &di, sizeof(di))) err = -EFAULT; hci_dev_put(hdev); return err; } /* ---- Interface to HCI drivers ---- */ static int hci_dev_do_poweroff(struct hci_dev *hdev) { int err; BT_DBG("%s %p", hdev->name, hdev); hci_req_sync_lock(hdev); err = hci_set_powered_sync(hdev, false); hci_req_sync_unlock(hdev); return err; } static int hci_rfkill_set_block(void *data, bool blocked) { struct hci_dev *hdev = data; int err; BT_DBG("%p name %s blocked %d", hdev, hdev->name, blocked); if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) return -EBUSY; if (blocked == hci_dev_test_flag(hdev, HCI_RFKILLED)) return 0; if (blocked) { hci_dev_set_flag(hdev, HCI_RFKILLED); if (!hci_dev_test_flag(hdev, HCI_SETUP) && !hci_dev_test_flag(hdev, HCI_CONFIG)) { err = hci_dev_do_poweroff(hdev); if (err) { bt_dev_err(hdev, "Error when powering off device on rfkill (%d)", err); /* Make sure the device is still closed even if * anything during power off sequence (eg. * disconnecting devices) failed. */ hci_dev_do_close(hdev); } } } else { hci_dev_clear_flag(hdev, HCI_RFKILLED); } return 0; } static const struct rfkill_ops hci_rfkill_ops = { .set_block = hci_rfkill_set_block, }; static void hci_power_on(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, power_on); int err; BT_DBG("%s", hdev->name); if (test_bit(HCI_UP, &hdev->flags) && hci_dev_test_flag(hdev, HCI_MGMT) && hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) { cancel_delayed_work(&hdev->power_off); err = hci_powered_update_sync(hdev); mgmt_power_on(hdev, err); return; } err = hci_dev_do_open(hdev); if (err < 0) { hci_dev_lock(hdev); mgmt_set_powered_failed(hdev, err); hci_dev_unlock(hdev); return; } /* During the HCI setup phase, a few error conditions are * ignored and they need to be checked now. If they are still * valid, it is important to turn the device back off. */ if (hci_dev_test_flag(hdev, HCI_RFKILLED) || hci_dev_test_flag(hdev, HCI_UNCONFIGURED) || (!bacmp(&hdev->bdaddr, BDADDR_ANY) && !bacmp(&hdev->static_addr, BDADDR_ANY))) { hci_dev_clear_flag(hdev, HCI_AUTO_OFF); hci_dev_do_close(hdev); } else if (hci_dev_test_flag(hdev, HCI_AUTO_OFF)) { queue_delayed_work(hdev->req_workqueue, &hdev->power_off, HCI_AUTO_OFF_TIMEOUT); } if (hci_dev_test_and_clear_flag(hdev, HCI_SETUP)) { /* For unconfigured devices, set the HCI_RAW flag * so that userspace can easily identify them. */ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) set_bit(HCI_RAW, &hdev->flags); /* For fully configured devices, this will send * the Index Added event. For unconfigured devices, * it will send Unconfigued Index Added event. * * Devices with HCI_QUIRK_RAW_DEVICE are ignored * and no event will be send. */ mgmt_index_added(hdev); } else if (hci_dev_test_and_clear_flag(hdev, HCI_CONFIG)) { /* When the controller is now configured, then it * is important to clear the HCI_RAW flag. */ if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) clear_bit(HCI_RAW, &hdev->flags); /* Powering on the controller with HCI_CONFIG set only * happens with the transition from unconfigured to * configured. This will send the Index Added event. */ mgmt_index_added(hdev); } } static void hci_power_off(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, power_off.work); BT_DBG("%s", hdev->name); hci_dev_do_close(hdev); } static void hci_error_reset(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, error_reset); hci_dev_hold(hdev); BT_DBG("%s", hdev->name); if (hdev->hw_error) hdev->hw_error(hdev, hdev->hw_error_code); else bt_dev_err(hdev, "hardware error 0x%2.2x", hdev->hw_error_code); if (!hci_dev_do_close(hdev)) hci_dev_do_open(hdev); hci_dev_put(hdev); } void hci_uuids_clear(struct hci_dev *hdev) { struct bt_uuid *uuid, *tmp; list_for_each_entry_safe(uuid, tmp, &hdev->uuids, list) { list_del(&uuid->list); kfree(uuid); } } void hci_link_keys_clear(struct hci_dev *hdev) { struct link_key *key, *tmp; list_for_each_entry_safe(key, tmp, &hdev->link_keys, list) { list_del_rcu(&key->list); kfree_rcu(key, rcu); } } void hci_smp_ltks_clear(struct hci_dev *hdev) { struct smp_ltk *k, *tmp; list_for_each_entry_safe(k, tmp, &hdev->long_term_keys, list) { list_del_rcu(&k->list); kfree_rcu(k, rcu); } } void hci_smp_irks_clear(struct hci_dev *hdev) { struct smp_irk *k, *tmp; list_for_each_entry_safe(k, tmp, &hdev->identity_resolving_keys, list) { list_del_rcu(&k->list); kfree_rcu(k, rcu); } } void hci_blocked_keys_clear(struct hci_dev *hdev) { struct blocked_key *b, *tmp; list_for_each_entry_safe(b, tmp, &hdev->blocked_keys, list) { list_del_rcu(&b->list); kfree_rcu(b, rcu); } } bool hci_is_blocked_key(struct hci_dev *hdev, u8 type, u8 val[16]) { bool blocked = false; struct blocked_key *b; rcu_read_lock(); list_for_each_entry_rcu(b, &hdev->blocked_keys, list) { if (b->type == type && !memcmp(b->val, val, sizeof(b->val))) { blocked = true; break; } } rcu_read_unlock(); return blocked; } struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr) { struct link_key *k; rcu_read_lock(); list_for_each_entry_rcu(k, &hdev->link_keys, list) { if (bacmp(bdaddr, &k->bdaddr) == 0) { rcu_read_unlock(); if (hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_LINKKEY, k->val)) { bt_dev_warn_ratelimited(hdev, "Link key blocked for %pMR", &k->bdaddr); return NULL; } return k; } } rcu_read_unlock(); return NULL; } static bool hci_persistent_key(struct hci_dev *hdev, struct hci_conn *conn, u8 key_type, u8 old_key_type) { /* Legacy key */ if (key_type < 0x03) return true; /* Debug keys are insecure so don't store them persistently */ if (key_type == HCI_LK_DEBUG_COMBINATION) return false; /* Changed combination key and there's no previous one */ if (key_type == HCI_LK_CHANGED_COMBINATION && old_key_type == 0xff) return false; /* Security mode 3 case */ if (!conn) return true; /* BR/EDR key derived using SC from an LE link */ if (conn->type == LE_LINK) return true; /* Neither local nor remote side had no-bonding as requirement */ if (conn->auth_type > 0x01 && conn->remote_auth > 0x01) return true; /* Local side had dedicated bonding as requirement */ if (conn->auth_type == 0x02 || conn->auth_type == 0x03) return true; /* Remote side had dedicated bonding as requirement */ if (conn->remote_auth == 0x02 || conn->remote_auth == 0x03) return true; /* If none of the above criteria match, then don't store the key * persistently */ return false; } static u8 ltk_role(u8 type) { if (type == SMP_LTK) return HCI_ROLE_MASTER; return HCI_ROLE_SLAVE; } struct smp_ltk *hci_find_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type, u8 role) { struct smp_ltk *k; rcu_read_lock(); list_for_each_entry_rcu(k, &hdev->long_term_keys, list) { if (addr_type != k->bdaddr_type || bacmp(bdaddr, &k->bdaddr)) continue; if (smp_ltk_is_sc(k) || ltk_role(k->type) == role) { rcu_read_unlock(); if (hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_LTK, k->val)) { bt_dev_warn_ratelimited(hdev, "LTK blocked for %pMR", &k->bdaddr); return NULL; } return k; } } rcu_read_unlock(); return NULL; } struct smp_irk *hci_find_irk_by_rpa(struct hci_dev *hdev, bdaddr_t *rpa) { struct smp_irk *irk_to_return = NULL; struct smp_irk *irk; rcu_read_lock(); list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) { if (!bacmp(&irk->rpa, rpa)) { irk_to_return = irk; goto done; } } list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) { if (smp_irk_matches(hdev, irk->val, rpa)) { bacpy(&irk->rpa, rpa); irk_to_return = irk; goto done; } } done: if (irk_to_return && hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_IRK, irk_to_return->val)) { bt_dev_warn_ratelimited(hdev, "Identity key blocked for %pMR", &irk_to_return->bdaddr); irk_to_return = NULL; } rcu_read_unlock(); return irk_to_return; } struct smp_irk *hci_find_irk_by_addr(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type) { struct smp_irk *irk_to_return = NULL; struct smp_irk *irk; /* Identity Address must be public or static random */ if (addr_type == ADDR_LE_DEV_RANDOM && (bdaddr->b[5] & 0xc0) != 0xc0) return NULL; rcu_read_lock(); list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) { if (addr_type == irk->addr_type && bacmp(bdaddr, &irk->bdaddr) == 0) { irk_to_return = irk; break; } } if (irk_to_return && hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_IRK, irk_to_return->val)) { bt_dev_warn_ratelimited(hdev, "Identity key blocked for %pMR", &irk_to_return->bdaddr); irk_to_return = NULL; } rcu_read_unlock(); return irk_to_return; } struct link_key *hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn, bdaddr_t *bdaddr, u8 *val, u8 type, u8 pin_len, bool *persistent) { struct link_key *key, *old_key; u8 old_key_type; old_key = hci_find_link_key(hdev, bdaddr); if (old_key) { old_key_type = old_key->type; key = old_key; } else { old_key_type = conn ? conn->key_type : 0xff; key = kzalloc(sizeof(*key), GFP_KERNEL); if (!key) return NULL; list_add_rcu(&key->list, &hdev->link_keys); } BT_DBG("%s key for %pMR type %u", hdev->name, bdaddr, type); /* Some buggy controller combinations generate a changed * combination key for legacy pairing even when there's no * previous key */ if (type == HCI_LK_CHANGED_COMBINATION && (!conn || conn->remote_auth == 0xff) && old_key_type == 0xff) { type = HCI_LK_COMBINATION; if (conn) conn->key_type = type; } bacpy(&key->bdaddr, bdaddr); memcpy(key->val, val, HCI_LINK_KEY_SIZE); key->pin_len = pin_len; if (type == HCI_LK_CHANGED_COMBINATION) key->type = old_key_type; else key->type = type; if (persistent) *persistent = hci_persistent_key(hdev, conn, type, old_key_type); return key; } struct smp_ltk *hci_add_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type, u8 type, u8 authenticated, u8 tk[16], u8 enc_size, __le16 ediv, __le64 rand) { struct smp_ltk *key, *old_key; u8 role = ltk_role(type); old_key = hci_find_ltk(hdev, bdaddr, addr_type, role); if (old_key) key = old_key; else { key = kzalloc(sizeof(*key), GFP_KERNEL); if (!key) return NULL; list_add_rcu(&key->list, &hdev->long_term_keys); } bacpy(&key->bdaddr, bdaddr); key->bdaddr_type = addr_type; memcpy(key->val, tk, sizeof(key->val)); key->authenticated = authenticated; key->ediv = ediv; key->rand = rand; key->enc_size = enc_size; key->type = type; return key; } struct smp_irk *hci_add_irk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type, u8 val[16], bdaddr_t *rpa) { struct smp_irk *irk; irk = hci_find_irk_by_addr(hdev, bdaddr, addr_type); if (!irk) { irk = kzalloc(sizeof(*irk), GFP_KERNEL); if (!irk) return NULL; bacpy(&irk->bdaddr, bdaddr); irk->addr_type = addr_type; list_add_rcu(&irk->list, &hdev->identity_resolving_keys); } memcpy(irk->val, val, 16); bacpy(&irk->rpa, rpa); return irk; } int hci_remove_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr) { struct link_key *key; key = hci_find_link_key(hdev, bdaddr); if (!key) return -ENOENT; BT_DBG("%s removing %pMR", hdev->name, bdaddr); list_del_rcu(&key->list); kfree_rcu(key, rcu); return 0; } int hci_remove_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type) { struct smp_ltk *k, *tmp; int removed = 0; list_for_each_entry_safe(k, tmp, &hdev->long_term_keys, list) { if (bacmp(bdaddr, &k->bdaddr) || k->bdaddr_type != bdaddr_type) continue; BT_DBG("%s removing %pMR", hdev->name, bdaddr); list_del_rcu(&k->list); kfree_rcu(k, rcu); removed++; } return removed ? 0 : -ENOENT; } void hci_remove_irk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type) { struct smp_irk *k, *tmp; list_for_each_entry_safe(k, tmp, &hdev->identity_resolving_keys, list) { if (bacmp(bdaddr, &k->bdaddr) || k->addr_type != addr_type) continue; BT_DBG("%s removing %pMR", hdev->name, bdaddr); list_del_rcu(&k->list); kfree_rcu(k, rcu); } } bool hci_bdaddr_is_paired(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type) { struct smp_ltk *k; struct smp_irk *irk; u8 addr_type; if (type == BDADDR_BREDR) { if (hci_find_link_key(hdev, bdaddr)) return true; return false; } /* Convert to HCI addr type which struct smp_ltk uses */ if (type == BDADDR_LE_PUBLIC) addr_type = ADDR_LE_DEV_PUBLIC; else addr_type = ADDR_LE_DEV_RANDOM; irk = hci_get_irk(hdev, bdaddr, addr_type); if (irk) { bdaddr = &irk->bdaddr; addr_type = irk->addr_type; } rcu_read_lock(); list_for_each_entry_rcu(k, &hdev->long_term_keys, list) { if (k->bdaddr_type == addr_type && !bacmp(bdaddr, &k->bdaddr)) { rcu_read_unlock(); return true; } } rcu_read_unlock(); return false; } /* HCI command timer function */ static void hci_cmd_timeout(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_timer.work); if (hdev->req_skb) { u16 opcode = hci_skb_opcode(hdev->req_skb); bt_dev_err(hdev, "command 0x%4.4x tx timeout", opcode); hci_cmd_sync_cancel_sync(hdev, ETIMEDOUT); } else { bt_dev_err(hdev, "command tx timeout"); } if (hdev->reset) hdev->reset(hdev); atomic_set(&hdev->cmd_cnt, 1); queue_work(hdev->workqueue, &hdev->cmd_work); } /* HCI ncmd timer function */ static void hci_ncmd_timeout(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, ncmd_timer.work); bt_dev_err(hdev, "Controller not accepting commands anymore: ncmd = 0"); /* During HCI_INIT phase no events can be injected if the ncmd timer * triggers since the procedure has its own timeout handling. */ if (test_bit(HCI_INIT, &hdev->flags)) return; /* This is an irrecoverable state, inject hardware error event */ hci_reset_dev(hdev); } struct oob_data *hci_find_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type) { struct oob_data *data; list_for_each_entry(data, &hdev->remote_oob_data, list) { if (bacmp(bdaddr, &data->bdaddr) != 0) continue; if (data->bdaddr_type != bdaddr_type) continue; return data; } return NULL; } int hci_remove_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type) { struct oob_data *data; data = hci_find_remote_oob_data(hdev, bdaddr, bdaddr_type); if (!data) return -ENOENT; BT_DBG("%s removing %pMR (%u)", hdev->name, bdaddr, bdaddr_type); list_del(&data->list); kfree(data); return 0; } void hci_remote_oob_data_clear(struct hci_dev *hdev) { struct oob_data *data, *n; list_for_each_entry_safe(data, n, &hdev->remote_oob_data, list) { list_del(&data->list); kfree(data); } } int hci_add_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type, u8 *hash192, u8 *rand192, u8 *hash256, u8 *rand256) { struct oob_data *data; data = hci_find_remote_oob_data(hdev, bdaddr, bdaddr_type); if (!data) { data = kmalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; bacpy(&data->bdaddr, bdaddr); data->bdaddr_type = bdaddr_type; list_add(&data->list, &hdev->remote_oob_data); } if (hash192 && rand192) { memcpy(data->hash192, hash192, sizeof(data->hash192)); memcpy(data->rand192, rand192, sizeof(data->rand192)); if (hash256 && rand256) data->present = 0x03; } else { memset(data->hash192, 0, sizeof(data->hash192)); memset(data->rand192, 0, sizeof(data->rand192)); if (hash256 && rand256) data->present = 0x02; else data->present = 0x00; } if (hash256 && rand256) { memcpy(data->hash256, hash256, sizeof(data->hash256)); memcpy(data->rand256, rand256, sizeof(data->rand256)); } else { memset(data->hash256, 0, sizeof(data->hash256)); memset(data->rand256, 0, sizeof(data->rand256)); if (hash192 && rand192) data->present = 0x01; } BT_DBG("%s for %pMR", hdev->name, bdaddr); return 0; } /* This function requires the caller holds hdev->lock */ struct adv_info *hci_find_adv_instance(struct hci_dev *hdev, u8 instance) { struct adv_info *adv_instance; list_for_each_entry(adv_instance, &hdev->adv_instances, list) { if (adv_instance->instance == instance) return adv_instance; } return NULL; } /* This function requires the caller holds hdev->lock */ struct adv_info *hci_find_adv_sid(struct hci_dev *hdev, u8 sid) { struct adv_info *adv; list_for_each_entry(adv, &hdev->adv_instances, list) { if (adv->sid == sid) return adv; } return NULL; } /* This function requires the caller holds hdev->lock */ struct adv_info *hci_get_next_instance(struct hci_dev *hdev, u8 instance) { struct adv_info *cur_instance; cur_instance = hci_find_adv_instance(hdev, instance); if (!cur_instance) return NULL; if (cur_instance == list_last_entry(&hdev->adv_instances, struct adv_info, list)) return list_first_entry(&hdev->adv_instances, struct adv_info, list); else return list_next_entry(cur_instance, list); } /* This function requires the caller holds hdev->lock */ int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance) { struct adv_info *adv_instance; adv_instance = hci_find_adv_instance(hdev, instance); if (!adv_instance) return -ENOENT; BT_DBG("%s removing %dMR", hdev->name, instance); if (hdev->cur_adv_instance == instance) { if (hdev->adv_instance_timeout) { cancel_delayed_work(&hdev->adv_instance_expire); hdev->adv_instance_timeout = 0; } hdev->cur_adv_instance = 0x00; } cancel_delayed_work_sync(&adv_instance->rpa_expired_cb); list_del(&adv_instance->list); kfree(adv_instance); hdev->adv_instance_cnt--; return 0; } void hci_adv_instances_set_rpa_expired(struct hci_dev *hdev, bool rpa_expired) { struct adv_info *adv_instance, *n; list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) adv_instance->rpa_expired = rpa_expired; } /* This function requires the caller holds hdev->lock */ void hci_adv_instances_clear(struct hci_dev *hdev) { struct adv_info *adv_instance, *n; if (hdev->adv_instance_timeout) { disable_delayed_work(&hdev->adv_instance_expire); hdev->adv_instance_timeout = 0; } list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) { disable_delayed_work_sync(&adv_instance->rpa_expired_cb); list_del(&adv_instance->list); kfree(adv_instance); } hdev->adv_instance_cnt = 0; hdev->cur_adv_instance = 0x00; } static void adv_instance_rpa_expired(struct work_struct *work) { struct adv_info *adv_instance = container_of(work, struct adv_info, rpa_expired_cb.work); BT_DBG(""); adv_instance->rpa_expired = true; } /* This function requires the caller holds hdev->lock */ struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, u16 adv_data_len, u8 *adv_data, u16 scan_rsp_len, u8 *scan_rsp_data, u16 timeout, u16 duration, s8 tx_power, u32 min_interval, u32 max_interval, u8 mesh_handle) { struct adv_info *adv; adv = hci_find_adv_instance(hdev, instance); if (adv) { memset(adv->adv_data, 0, sizeof(adv->adv_data)); memset(adv->scan_rsp_data, 0, sizeof(adv->scan_rsp_data)); memset(adv->per_adv_data, 0, sizeof(adv->per_adv_data)); } else { if (hdev->adv_instance_cnt >= hdev->le_num_of_adv_sets || instance < 1 || instance > hdev->le_num_of_adv_sets + 1) return ERR_PTR(-EOVERFLOW); adv = kzalloc(sizeof(*adv), GFP_KERNEL); if (!adv) return ERR_PTR(-ENOMEM); adv->pending = true; adv->instance = instance; /* If controller support only one set and the instance is set to * 1 then there is no option other than using handle 0x00. */ if (hdev->le_num_of_adv_sets == 1 && instance == 1) adv->handle = 0x00; else adv->handle = instance; list_add(&adv->list, &hdev->adv_instances); hdev->adv_instance_cnt++; } adv->flags = flags; adv->min_interval = min_interval; adv->max_interval = max_interval; adv->tx_power = tx_power; /* Defining a mesh_handle changes the timing units to ms, * rather than seconds, and ties the instance to the requested * mesh_tx queue. */ adv->mesh = mesh_handle; hci_set_adv_instance_data(hdev, instance, adv_data_len, adv_data, scan_rsp_len, scan_rsp_data); adv->timeout = timeout; adv->remaining_time = timeout; if (duration == 0) adv->duration = hdev->def_multi_adv_rotation_duration; else adv->duration = duration; INIT_DELAYED_WORK(&adv->rpa_expired_cb, adv_instance_rpa_expired); BT_DBG("%s for %dMR", hdev->name, instance); return adv; } /* This function requires the caller holds hdev->lock */ struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance, u8 sid, u32 flags, u8 data_len, u8 *data, u32 min_interval, u32 max_interval) { struct adv_info *adv; adv = hci_add_adv_instance(hdev, instance, flags, 0, NULL, 0, NULL, 0, 0, HCI_ADV_TX_POWER_NO_PREFERENCE, min_interval, max_interval, 0); if (IS_ERR(adv)) return adv; adv->sid = sid; adv->periodic = true; adv->per_adv_data_len = data_len; if (data) memcpy(adv->per_adv_data, data, data_len); return adv; } /* This function requires the caller holds hdev->lock */ int hci_set_adv_instance_data(struct hci_dev *hdev, u8 instance, u16 adv_data_len, u8 *adv_data, u16 scan_rsp_len, u8 *scan_rsp_data) { struct adv_info *adv; adv = hci_find_adv_instance(hdev, instance); /* If advertisement doesn't exist, we can't modify its data */ if (!adv) return -ENOENT; if (adv_data_len && ADV_DATA_CMP(adv, adv_data, adv_data_len)) { memset(adv->adv_data, 0, sizeof(adv->adv_data)); memcpy(adv->adv_data, adv_data, adv_data_len); adv->adv_data_len = adv_data_len; adv->adv_data_changed = true; } if (scan_rsp_len && SCAN_RSP_CMP(adv, scan_rsp_data, scan_rsp_len)) { memset(adv->scan_rsp_data, 0, sizeof(adv->scan_rsp_data)); memcpy(adv->scan_rsp_data, scan_rsp_data, scan_rsp_len); adv->scan_rsp_len = scan_rsp_len; adv->scan_rsp_changed = true; } /* Mark as changed if there are flags which would affect it */ if (((adv->flags & MGMT_ADV_FLAG_APPEARANCE) && hdev->appearance) || adv->flags & MGMT_ADV_FLAG_LOCAL_NAME) adv->scan_rsp_changed = true; return 0; } /* This function requires the caller holds hdev->lock */ u32 hci_adv_instance_flags(struct hci_dev *hdev, u8 instance) { u32 flags; struct adv_info *adv; if (instance == 0x00) { /* Instance 0 always manages the "Tx Power" and "Flags" * fields */ flags = MGMT_ADV_FLAG_TX_POWER | MGMT_ADV_FLAG_MANAGED_FLAGS; /* For instance 0, the HCI_ADVERTISING_CONNECTABLE setting * corresponds to the "connectable" instance flag. */ if (hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE)) flags |= MGMT_ADV_FLAG_CONNECTABLE; if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) flags |= MGMT_ADV_FLAG_LIMITED_DISCOV; else if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE)) flags |= MGMT_ADV_FLAG_DISCOV; return flags; } adv = hci_find_adv_instance(hdev, instance); /* Return 0 when we got an invalid instance identifier. */ if (!adv) return 0; return adv->flags; } bool hci_adv_instance_is_scannable(struct hci_dev *hdev, u8 instance) { struct adv_info *adv; /* Instance 0x00 always set local name */ if (instance == 0x00) return true; adv = hci_find_adv_instance(hdev, instance); if (!adv) return false; if (adv->flags & MGMT_ADV_FLAG_APPEARANCE || adv->flags & MGMT_ADV_FLAG_LOCAL_NAME) return true; return adv->scan_rsp_len ? true : false; } /* This function requires the caller holds hdev->lock */ void hci_adv_monitors_clear(struct hci_dev *hdev) { struct adv_monitor *monitor; int handle; idr_for_each_entry(&hdev->adv_monitors_idr, monitor, handle) hci_free_adv_monitor(hdev, monitor); idr_destroy(&hdev->adv_monitors_idr); } /* Frees the monitor structure and do some bookkeepings. * This function requires the caller holds hdev->lock. */ void hci_free_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) { struct adv_pattern *pattern; struct adv_pattern *tmp; if (!monitor) return; list_for_each_entry_safe(pattern, tmp, &monitor->patterns, list) { list_del(&pattern->list); kfree(pattern); } if (monitor->handle) idr_remove(&hdev->adv_monitors_idr, monitor->handle); if (monitor->state != ADV_MONITOR_STATE_NOT_REGISTERED) hdev->adv_monitors_cnt--; kfree(monitor); } /* Assigns handle to a monitor, and if offloading is supported and power is on, * also attempts to forward the request to the controller. * This function requires the caller holds hci_req_sync_lock. */ int hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) { int min, max, handle; int status = 0; if (!monitor) return -EINVAL; hci_dev_lock(hdev); min = HCI_MIN_ADV_MONITOR_HANDLE; max = HCI_MIN_ADV_MONITOR_HANDLE + HCI_MAX_ADV_MONITOR_NUM_HANDLES; handle = idr_alloc(&hdev->adv_monitors_idr, monitor, min, max, GFP_KERNEL); hci_dev_unlock(hdev); if (handle < 0) return handle; monitor->handle = handle; if (!hdev_is_powered(hdev)) return status; switch (hci_get_adv_monitor_offload_ext(hdev)) { case HCI_ADV_MONITOR_EXT_NONE: bt_dev_dbg(hdev, "add monitor %d status %d", monitor->handle, status); /* Message was not forwarded to controller - not an error */ break; case HCI_ADV_MONITOR_EXT_MSFT: status = msft_add_monitor_pattern(hdev, monitor); bt_dev_dbg(hdev, "add monitor %d msft status %d", handle, status); break; } return status; } /* Attempts to tell the controller and free the monitor. If somehow the * controller doesn't have a corresponding handle, remove anyway. * This function requires the caller holds hci_req_sync_lock. */ static int hci_remove_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) { int status = 0; int handle; switch (hci_get_adv_monitor_offload_ext(hdev)) { case HCI_ADV_MONITOR_EXT_NONE: /* also goes here when powered off */ bt_dev_dbg(hdev, "remove monitor %d status %d", monitor->handle, status); goto free_monitor; case HCI_ADV_MONITOR_EXT_MSFT: handle = monitor->handle; status = msft_remove_monitor(hdev, monitor); bt_dev_dbg(hdev, "remove monitor %d msft status %d", handle, status); break; } /* In case no matching handle registered, just free the monitor */ if (status == -ENOENT) goto free_monitor; return status; free_monitor: if (status == -ENOENT) bt_dev_warn(hdev, "Removing monitor with no matching handle %d", monitor->handle); hci_free_adv_monitor(hdev, monitor); return status; } /* This function requires the caller holds hci_req_sync_lock */ int hci_remove_single_adv_monitor(struct hci_dev *hdev, u16 handle) { struct adv_monitor *monitor = idr_find(&hdev->adv_monitors_idr, handle); if (!monitor) return -EINVAL; return hci_remove_adv_monitor(hdev, monitor); } /* This function requires the caller holds hci_req_sync_lock */ int hci_remove_all_adv_monitor(struct hci_dev *hdev) { struct adv_monitor *monitor; int idr_next_id = 0; int status = 0; while (1) { monitor = idr_get_next(&hdev->adv_monitors_idr, &idr_next_id); if (!monitor) break; status = hci_remove_adv_monitor(hdev, monitor); if (status) return status; idr_next_id++; } return status; } /* This function requires the caller holds hdev->lock */ bool hci_is_adv_monitoring(struct hci_dev *hdev) { return !idr_is_empty(&hdev->adv_monitors_idr); } int hci_get_adv_monitor_offload_ext(struct hci_dev *hdev) { if (msft_monitor_supported(hdev)) return HCI_ADV_MONITOR_EXT_MSFT; return HCI_ADV_MONITOR_EXT_NONE; } struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *bdaddr_list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list *b; list_for_each_entry(b, bdaddr_list, list) { if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type) return b; } return NULL; } struct bdaddr_list_with_irk *hci_bdaddr_list_lookup_with_irk( struct list_head *bdaddr_list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list_with_irk *b; list_for_each_entry(b, bdaddr_list, list) { if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type) return b; } return NULL; } struct bdaddr_list_with_flags * hci_bdaddr_list_lookup_with_flags(struct list_head *bdaddr_list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list_with_flags *b; list_for_each_entry(b, bdaddr_list, list) { if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type) return b; } return NULL; } void hci_bdaddr_list_clear(struct list_head *bdaddr_list) { struct bdaddr_list *b, *n; list_for_each_entry_safe(b, n, bdaddr_list, list) { list_del(&b->list); kfree(b); } } int hci_bdaddr_list_add(struct list_head *list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list *entry; if (!bacmp(bdaddr, BDADDR_ANY)) return -EBADF; if (hci_bdaddr_list_lookup(list, bdaddr, type)) return -EEXIST; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; bacpy(&entry->bdaddr, bdaddr); entry->bdaddr_type = type; list_add(&entry->list, list); return 0; } int hci_bdaddr_list_add_with_irk(struct list_head *list, bdaddr_t *bdaddr, u8 type, u8 *peer_irk, u8 *local_irk) { struct bdaddr_list_with_irk *entry; if (!bacmp(bdaddr, BDADDR_ANY)) return -EBADF; if (hci_bdaddr_list_lookup(list, bdaddr, type)) return -EEXIST; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; bacpy(&entry->bdaddr, bdaddr); entry->bdaddr_type = type; if (peer_irk) memcpy(entry->peer_irk, peer_irk, 16); if (local_irk) memcpy(entry->local_irk, local_irk, 16); list_add(&entry->list, list); return 0; } int hci_bdaddr_list_add_with_flags(struct list_head *list, bdaddr_t *bdaddr, u8 type, u32 flags) { struct bdaddr_list_with_flags *entry; if (!bacmp(bdaddr, BDADDR_ANY)) return -EBADF; if (hci_bdaddr_list_lookup(list, bdaddr, type)) return -EEXIST; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; bacpy(&entry->bdaddr, bdaddr); entry->bdaddr_type = type; entry->flags = flags; list_add(&entry->list, list); return 0; } int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list *entry; if (!bacmp(bdaddr, BDADDR_ANY)) { hci_bdaddr_list_clear(list); return 0; } entry = hci_bdaddr_list_lookup(list, bdaddr, type); if (!entry) return -ENOENT; list_del(&entry->list); kfree(entry); return 0; } int hci_bdaddr_list_del_with_irk(struct list_head *list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list_with_irk *entry; if (!bacmp(bdaddr, BDADDR_ANY)) { hci_bdaddr_list_clear(list); return 0; } entry = hci_bdaddr_list_lookup_with_irk(list, bdaddr, type); if (!entry) return -ENOENT; list_del(&entry->list); kfree(entry); return 0; } /* This function requires the caller holds hdev->lock */ struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) { struct hci_conn_params *params; list_for_each_entry(params, &hdev->le_conn_params, list) { if (bacmp(&params->addr, addr) == 0 && params->addr_type == addr_type) { return params; } } return NULL; } /* This function requires the caller holds hdev->lock or rcu_read_lock */ struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, bdaddr_t *addr, u8 addr_type) { struct hci_conn_params *param; rcu_read_lock(); list_for_each_entry_rcu(param, list, action) { if (bacmp(&param->addr, addr) == 0 && param->addr_type == addr_type) { rcu_read_unlock(); return param; } } rcu_read_unlock(); return NULL; } /* This function requires the caller holds hdev->lock */ void hci_pend_le_list_del_init(struct hci_conn_params *param) { if (list_empty(&param->action)) return; list_del_rcu(&param->action); synchronize_rcu(); INIT_LIST_HEAD(&param->action); } /* This function requires the caller holds hdev->lock */ void hci_pend_le_list_add(struct hci_conn_params *param, struct list_head *list) { list_add_rcu(&param->action, list); } /* This function requires the caller holds hdev->lock */ struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) { struct hci_conn_params *params; params = hci_conn_params_lookup(hdev, addr, addr_type); if (params) return params; params = kzalloc(sizeof(*params), GFP_KERNEL); if (!params) { bt_dev_err(hdev, "out of memory"); return NULL; } bacpy(&params->addr, addr); params->addr_type = addr_type; list_add(&params->list, &hdev->le_conn_params); INIT_LIST_HEAD(&params->action); params->conn_min_interval = hdev->le_conn_min_interval; params->conn_max_interval = hdev->le_conn_max_interval; params->conn_latency = hdev->le_conn_latency; params->supervision_timeout = hdev->le_supv_timeout; params->auto_connect = HCI_AUTO_CONN_DISABLED; BT_DBG("addr %pMR (type %u)", addr, addr_type); return params; } void hci_conn_params_free(struct hci_conn_params *params) { hci_pend_le_list_del_init(params); if (params->conn) { hci_conn_drop(params->conn); hci_conn_put(params->conn); } list_del(&params->list); kfree(params); } /* This function requires the caller holds hdev->lock */ void hci_conn_params_del(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) { struct hci_conn_params *params; params = hci_conn_params_lookup(hdev, addr, addr_type); if (!params) return; hci_conn_params_free(params); hci_update_passive_scan(hdev); BT_DBG("addr %pMR (type %u)", addr, addr_type); } /* This function requires the caller holds hdev->lock */ void hci_conn_params_clear_disabled(struct hci_dev *hdev) { struct hci_conn_params *params, *tmp; list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list) { if (params->auto_connect != HCI_AUTO_CONN_DISABLED) continue; /* If trying to establish one time connection to disabled * device, leave the params, but mark them as just once. */ if (params->explicit_connect) { params->auto_connect = HCI_AUTO_CONN_EXPLICIT; continue; } hci_conn_params_free(params); } BT_DBG("All LE disabled connection parameters were removed"); } /* This function requires the caller holds hdev->lock */ static void hci_conn_params_clear_all(struct hci_dev *hdev) { struct hci_conn_params *params, *tmp; list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list) hci_conn_params_free(params); BT_DBG("All LE connection parameters were removed"); } /* Copy the Identity Address of the controller. * * If the controller has a public BD_ADDR, then by default use that one. * If this is a LE only controller without a public address, default to * the static random address. * * For debugging purposes it is possible to force controllers with a * public address to use the static random address instead. * * In case BR/EDR has been disabled on a dual-mode controller and * userspace has configured a static address, then that address * becomes the identity address instead of the public BR/EDR address. */ void hci_copy_identity_address(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 *bdaddr_type) { if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) || !bacmp(&hdev->bdaddr, BDADDR_ANY) || (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) && bacmp(&hdev->static_addr, BDADDR_ANY))) { bacpy(bdaddr, &hdev->static_addr); *bdaddr_type = ADDR_LE_DEV_RANDOM; } else { bacpy(bdaddr, &hdev->bdaddr); *bdaddr_type = ADDR_LE_DEV_PUBLIC; } } static void hci_clear_wake_reason(struct hci_dev *hdev) { hci_dev_lock(hdev); hdev->wake_reason = 0; bacpy(&hdev->wake_addr, BDADDR_ANY); hdev->wake_addr_type = 0; hci_dev_unlock(hdev); } static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action, void *data) { struct hci_dev *hdev = container_of(nb, struct hci_dev, suspend_notifier); int ret = 0; /* Userspace has full control of this device. Do nothing. */ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) return NOTIFY_DONE; /* To avoid a potential race with hci_unregister_dev. */ hci_dev_hold(hdev); switch (action) { case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: ret = hci_suspend_dev(hdev); break; case PM_POST_HIBERNATION: case PM_POST_SUSPEND: ret = hci_resume_dev(hdev); break; } if (ret) bt_dev_err(hdev, "Suspend notifier action (%lu) failed: %d", action, ret); hci_dev_put(hdev); return NOTIFY_DONE; } /* Alloc HCI device */ struct hci_dev *hci_alloc_dev_priv(int sizeof_priv) { struct hci_dev *hdev; unsigned int alloc_size; alloc_size = sizeof(*hdev); if (sizeof_priv) { /* Fixme: May need ALIGN-ment? */ alloc_size += sizeof_priv; } hdev = kzalloc(alloc_size, GFP_KERNEL); if (!hdev) return NULL; if (init_srcu_struct(&hdev->srcu)) { kfree(hdev); return NULL; } hdev->pkt_type = (HCI_DM1 | HCI_DH1 | HCI_HV1); hdev->esco_type = (ESCO_HV1); hdev->link_mode = (HCI_LM_ACCEPT); hdev->num_iac = 0x01; /* One IAC support is mandatory */ hdev->io_capability = 0x03; /* No Input No Output */ hdev->manufacturer = 0xffff; /* Default to internal use */ hdev->inq_tx_power = HCI_TX_POWER_INVALID; hdev->adv_tx_power = HCI_TX_POWER_INVALID; hdev->adv_instance_cnt = 0; hdev->cur_adv_instance = 0x00; hdev->adv_instance_timeout = 0; hdev->advmon_allowlist_duration = 300; hdev->advmon_no_filter_duration = 500; hdev->enable_advmon_interleave_scan = 0x00; /* Default to disable */ hdev->sniff_max_interval = 800; hdev->sniff_min_interval = 80; hdev->le_adv_channel_map = 0x07; hdev->le_adv_min_interval = 0x0800; hdev->le_adv_max_interval = 0x0800; hdev->le_scan_interval = DISCOV_LE_SCAN_INT_FAST; hdev->le_scan_window = DISCOV_LE_SCAN_WIN_FAST; hdev->le_scan_int_suspend = DISCOV_LE_SCAN_INT_SLOW1; hdev->le_scan_window_suspend = DISCOV_LE_SCAN_WIN_SLOW1; hdev->le_scan_int_discovery = DISCOV_LE_SCAN_INT; hdev->le_scan_window_discovery = DISCOV_LE_SCAN_WIN; hdev->le_scan_int_adv_monitor = DISCOV_LE_SCAN_INT_FAST; hdev->le_scan_window_adv_monitor = DISCOV_LE_SCAN_WIN_FAST; hdev->le_scan_int_connect = DISCOV_LE_SCAN_INT_CONN; hdev->le_scan_window_connect = DISCOV_LE_SCAN_WIN_CONN; hdev->le_conn_min_interval = 0x0018; hdev->le_conn_max_interval = 0x0028; hdev->le_conn_latency = 0x0000; hdev->le_supv_timeout = 0x002a; hdev->le_def_tx_len = 0x001b; hdev->le_def_tx_time = 0x0148; hdev->le_max_tx_len = 0x001b; hdev->le_max_tx_time = 0x0148; hdev->le_max_rx_len = 0x001b; hdev->le_max_rx_time = 0x0148; hdev->le_max_key_size = SMP_MAX_ENC_KEY_SIZE; hdev->le_min_key_size = SMP_MIN_ENC_KEY_SIZE; hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M; hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M; hdev->le_num_of_adv_sets = HCI_MAX_ADV_INSTANCES; hdev->def_multi_adv_rotation_duration = HCI_DEFAULT_ADV_DURATION; hdev->def_le_autoconnect_timeout = HCI_LE_CONN_TIMEOUT; hdev->min_le_tx_power = HCI_TX_POWER_INVALID; hdev->max_le_tx_power = HCI_TX_POWER_INVALID; hdev->rpa_timeout = HCI_DEFAULT_RPA_TIMEOUT; hdev->discov_interleaved_timeout = DISCOV_INTERLEAVED_TIMEOUT; hdev->conn_info_min_age = DEFAULT_CONN_INFO_MIN_AGE; hdev->conn_info_max_age = DEFAULT_CONN_INFO_MAX_AGE; hdev->auth_payload_timeout = DEFAULT_AUTH_PAYLOAD_TIMEOUT; hdev->min_enc_key_size = HCI_MIN_ENC_KEY_SIZE; /* default 1.28 sec page scan */ hdev->def_page_scan_type = PAGE_SCAN_TYPE_STANDARD; hdev->def_page_scan_int = 0x0800; hdev->def_page_scan_window = 0x0012; mutex_init(&hdev->lock); mutex_init(&hdev->req_lock); mutex_init(&hdev->mgmt_pending_lock); ida_init(&hdev->unset_handle_ida); INIT_LIST_HEAD(&hdev->mesh_pending); INIT_LIST_HEAD(&hdev->mgmt_pending); INIT_LIST_HEAD(&hdev->reject_list); INIT_LIST_HEAD(&hdev->accept_list); INIT_LIST_HEAD(&hdev->uuids); INIT_LIST_HEAD(&hdev->link_keys); INIT_LIST_HEAD(&hdev->long_term_keys); INIT_LIST_HEAD(&hdev->identity_resolving_keys); INIT_LIST_HEAD(&hdev->remote_oob_data); INIT_LIST_HEAD(&hdev->le_accept_list); INIT_LIST_HEAD(&hdev->le_resolv_list); INIT_LIST_HEAD(&hdev->le_conn_params); INIT_LIST_HEAD(&hdev->pend_le_conns); INIT_LIST_HEAD(&hdev->pend_le_reports); INIT_LIST_HEAD(&hdev->conn_hash.list); INIT_LIST_HEAD(&hdev->adv_instances); INIT_LIST_HEAD(&hdev->blocked_keys); INIT_LIST_HEAD(&hdev->monitored_devices); INIT_LIST_HEAD(&hdev->local_codecs); INIT_WORK(&hdev->rx_work, hci_rx_work); INIT_WORK(&hdev->cmd_work, hci_cmd_work); INIT_WORK(&hdev->tx_work, hci_tx_work); INIT_WORK(&hdev->power_on, hci_power_on); INIT_WORK(&hdev->error_reset, hci_error_reset); hci_cmd_sync_init(hdev); INIT_DELAYED_WORK(&hdev->power_off, hci_power_off); skb_queue_head_init(&hdev->rx_q); skb_queue_head_init(&hdev->cmd_q); skb_queue_head_init(&hdev->raw_q); init_waitqueue_head(&hdev->req_wait_q); INIT_DELAYED_WORK(&hdev->cmd_timer, hci_cmd_timeout); INIT_DELAYED_WORK(&hdev->ncmd_timer, hci_ncmd_timeout); hci_devcd_setup(hdev); hci_init_sysfs(hdev); discovery_init(hdev); return hdev; } EXPORT_SYMBOL(hci_alloc_dev_priv); /* Free HCI device */ void hci_free_dev(struct hci_dev *hdev) { /* will free via device release */ put_device(&hdev->dev); } EXPORT_SYMBOL(hci_free_dev); /* Register HCI device */ int hci_register_dev(struct hci_dev *hdev) { int id, error; if (!hdev->open || !hdev->close || !hdev->send) return -EINVAL; id = ida_alloc_max(&hci_index_ida, HCI_MAX_ID - 1, GFP_KERNEL); if (id < 0) return id; error = dev_set_name(&hdev->dev, "hci%u", id); if (error) return error; hdev->name = dev_name(&hdev->dev); hdev->id = id; BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus); hdev->workqueue = alloc_ordered_workqueue("%s", WQ_HIGHPRI, hdev->name); if (!hdev->workqueue) { error = -ENOMEM; goto err; } hdev->req_workqueue = alloc_ordered_workqueue("%s", WQ_HIGHPRI, hdev->name); if (!hdev->req_workqueue) { destroy_workqueue(hdev->workqueue); error = -ENOMEM; goto err; } if (!IS_ERR_OR_NULL(bt_debugfs)) hdev->debugfs = debugfs_create_dir(hdev->name, bt_debugfs); error = device_add(&hdev->dev); if (error < 0) goto err_wqueue; hci_leds_init(hdev); hdev->rfkill = rfkill_alloc(hdev->name, &hdev->dev, RFKILL_TYPE_BLUETOOTH, &hci_rfkill_ops, hdev); if (hdev->rfkill) { if (rfkill_register(hdev->rfkill) < 0) { rfkill_destroy(hdev->rfkill); hdev->rfkill = NULL; } } if (hdev->rfkill && rfkill_blocked(hdev->rfkill)) hci_dev_set_flag(hdev, HCI_RFKILLED); hci_dev_set_flag(hdev, HCI_SETUP); hci_dev_set_flag(hdev, HCI_AUTO_OFF); /* Assume BR/EDR support until proven otherwise (such as * through reading supported features during init. */ hci_dev_set_flag(hdev, HCI_BREDR_ENABLED); write_lock(&hci_dev_list_lock); list_add(&hdev->list, &hci_dev_list); write_unlock(&hci_dev_list_lock); /* Devices that are marked for raw-only usage are unconfigured * and should not be included in normal operation. */ if (hci_test_quirk(hdev, HCI_QUIRK_RAW_DEVICE)) hci_dev_set_flag(hdev, HCI_UNCONFIGURED); /* Mark Remote Wakeup connection flag as supported if driver has wakeup * callback. */ if (hdev->wakeup) hdev->conn_flags |= HCI_CONN_FLAG_REMOTE_WAKEUP; hci_sock_dev_event(hdev, HCI_DEV_REG); hci_dev_hold(hdev); error = hci_register_suspend_notifier(hdev); if (error) BT_WARN("register suspend notifier failed error:%d\n", error); queue_work(hdev->req_workqueue, &hdev->power_on); idr_init(&hdev->adv_monitors_idr); msft_register(hdev); return id; err_wqueue: debugfs_remove_recursive(hdev->debugfs); destroy_workqueue(hdev->workqueue); destroy_workqueue(hdev->req_workqueue); err: ida_free(&hci_index_ida, hdev->id); return error; } EXPORT_SYMBOL(hci_register_dev); /* Unregister HCI device */ void hci_unregister_dev(struct hci_dev *hdev) { BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus); mutex_lock(&hdev->unregister_lock); hci_dev_set_flag(hdev, HCI_UNREGISTER); mutex_unlock(&hdev->unregister_lock); write_lock(&hci_dev_list_lock); list_del(&hdev->list); write_unlock(&hci_dev_list_lock); synchronize_srcu(&hdev->srcu); cleanup_srcu_struct(&hdev->srcu); disable_work_sync(&hdev->rx_work); disable_work_sync(&hdev->cmd_work); disable_work_sync(&hdev->tx_work); disable_work_sync(&hdev->power_on); disable_work_sync(&hdev->error_reset); hci_cmd_sync_clear(hdev); hci_unregister_suspend_notifier(hdev); hci_dev_do_close(hdev); if (!test_bit(HCI_INIT, &hdev->flags) && !hci_dev_test_flag(hdev, HCI_SETUP) && !hci_dev_test_flag(hdev, HCI_CONFIG)) { hci_dev_lock(hdev); mgmt_index_removed(hdev); hci_dev_unlock(hdev); } /* mgmt_index_removed should take care of emptying the * pending list */ BUG_ON(!list_empty(&hdev->mgmt_pending)); hci_sock_dev_event(hdev, HCI_DEV_UNREG); if (hdev->rfkill) { rfkill_unregister(hdev->rfkill); rfkill_destroy(hdev->rfkill); } device_del(&hdev->dev); /* Actual cleanup is deferred until hci_release_dev(). */ hci_dev_put(hdev); } EXPORT_SYMBOL(hci_unregister_dev); /* Release HCI device */ void hci_release_dev(struct hci_dev *hdev) { debugfs_remove_recursive(hdev->debugfs); kfree_const(hdev->hw_info); kfree_const(hdev->fw_info); destroy_workqueue(hdev->workqueue); destroy_workqueue(hdev->req_workqueue); hci_dev_lock(hdev); hci_bdaddr_list_clear(&hdev->reject_list); hci_bdaddr_list_clear(&hdev->accept_list); hci_uuids_clear(hdev); hci_link_keys_clear(hdev); hci_smp_ltks_clear(hdev); hci_smp_irks_clear(hdev); hci_remote_oob_data_clear(hdev); hci_adv_instances_clear(hdev); hci_adv_monitors_clear(hdev); hci_bdaddr_list_clear(&hdev->le_accept_list); hci_bdaddr_list_clear(&hdev->le_resolv_list); hci_conn_params_clear_all(hdev); hci_discovery_filter_clear(hdev); hci_blocked_keys_clear(hdev); hci_codec_list_clear(&hdev->local_codecs); msft_release(hdev); hci_dev_unlock(hdev); ida_destroy(&hdev->unset_handle_ida); ida_free(&hci_index_ida, hdev->id); kfree_skb(hdev->sent_cmd); kfree_skb(hdev->req_skb); kfree_skb(hdev->recv_event); kfree(hdev); } EXPORT_SYMBOL(hci_release_dev); int hci_register_suspend_notifier(struct hci_dev *hdev) { int ret = 0; if (!hdev->suspend_notifier.notifier_call && !hci_test_quirk(hdev, HCI_QUIRK_NO_SUSPEND_NOTIFIER)) { hdev->suspend_notifier.notifier_call = hci_suspend_notifier; ret = register_pm_notifier(&hdev->suspend_notifier); } return ret; } int hci_unregister_suspend_notifier(struct hci_dev *hdev) { int ret = 0; if (hdev->suspend_notifier.notifier_call) { ret = unregister_pm_notifier(&hdev->suspend_notifier); if (!ret) hdev->suspend_notifier.notifier_call = NULL; } return ret; } /* Cancel ongoing command synchronously: * * - Cancel command timer * - Reset command counter * - Cancel command request */ static void hci_cancel_cmd_sync(struct hci_dev *hdev, int err) { bt_dev_dbg(hdev, "err 0x%2.2x", err); if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) { disable_delayed_work_sync(&hdev->cmd_timer); disable_delayed_work_sync(&hdev->ncmd_timer); } else { cancel_delayed_work_sync(&hdev->cmd_timer); cancel_delayed_work_sync(&hdev->ncmd_timer); } atomic_set(&hdev->cmd_cnt, 1); hci_cmd_sync_cancel_sync(hdev, err); } /* Suspend HCI device */ int hci_suspend_dev(struct hci_dev *hdev) { int ret; bt_dev_dbg(hdev, ""); /* Suspend should only act on when powered. */ if (!hdev_is_powered(hdev) || hci_dev_test_flag(hdev, HCI_UNREGISTER)) return 0; /* If powering down don't attempt to suspend */ if (mgmt_powering_down(hdev)) return 0; /* Cancel potentially blocking sync operation before suspend */ hci_cancel_cmd_sync(hdev, EHOSTDOWN); hci_req_sync_lock(hdev); ret = hci_suspend_sync(hdev); hci_req_sync_unlock(hdev); hci_clear_wake_reason(hdev); mgmt_suspending(hdev, hdev->suspend_state); hci_sock_dev_event(hdev, HCI_DEV_SUSPEND); return ret; } EXPORT_SYMBOL(hci_suspend_dev); /* Resume HCI device */ int hci_resume_dev(struct hci_dev *hdev) { int ret; bt_dev_dbg(hdev, ""); /* Resume should only act on when powered. */ if (!hdev_is_powered(hdev) || hci_dev_test_flag(hdev, HCI_UNREGISTER)) return 0; /* If powering down don't attempt to resume */ if (mgmt_powering_down(hdev)) return 0; hci_req_sync_lock(hdev); ret = hci_resume_sync(hdev); hci_req_sync_unlock(hdev); mgmt_resuming(hdev, hdev->wake_reason, &hdev->wake_addr, hdev->wake_addr_type); hci_sock_dev_event(hdev, HCI_DEV_RESUME); return ret; } EXPORT_SYMBOL(hci_resume_dev); /* Reset HCI device */ int hci_reset_dev(struct hci_dev *hdev) { static const u8 hw_err[] = { HCI_EV_HARDWARE_ERROR, 0x01, 0x00 }; struct sk_buff *skb; skb = bt_skb_alloc(3, GFP_ATOMIC); if (!skb) return -ENOMEM; hci_skb_pkt_type(skb) = HCI_EVENT_PKT; skb_put_data(skb, hw_err, 3); bt_dev_err(hdev, "Injecting HCI hardware error event"); /* Send Hardware Error to upper stack */ return hci_recv_frame(hdev, skb); } EXPORT_SYMBOL(hci_reset_dev); static u8 hci_dev_classify_pkt_type(struct hci_dev *hdev, struct sk_buff *skb) { if (hdev->classify_pkt_type) return hdev->classify_pkt_type(hdev, skb); return hci_skb_pkt_type(skb); } /* Receive frame from HCI drivers */ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb) { u8 dev_pkt_type; if (!hdev || (!test_bit(HCI_UP, &hdev->flags) && !test_bit(HCI_INIT, &hdev->flags))) { kfree_skb(skb); return -ENXIO; } /* Check if the driver agree with packet type classification */ dev_pkt_type = hci_dev_classify_pkt_type(hdev, skb); if (hci_skb_pkt_type(skb) != dev_pkt_type) { hci_skb_pkt_type(skb) = dev_pkt_type; } switch (hci_skb_pkt_type(skb)) { case HCI_EVENT_PKT: break; case HCI_ACLDATA_PKT: /* Detect if ISO packet has been sent as ACL */ if (hci_conn_num(hdev, CIS_LINK) || hci_conn_num(hdev, BIS_LINK) || hci_conn_num(hdev, PA_LINK)) { __u16 handle = __le16_to_cpu(hci_acl_hdr(skb)->handle); __u8 type; type = hci_conn_lookup_type(hdev, hci_handle(handle)); if (type == CIS_LINK || type == BIS_LINK || type == PA_LINK) hci_skb_pkt_type(skb) = HCI_ISODATA_PKT; } break; case HCI_SCODATA_PKT: break; case HCI_ISODATA_PKT: break; case HCI_DRV_PKT: break; default: kfree_skb(skb); return -EINVAL; } /* Incoming skb */ bt_cb(skb)->incoming = 1; /* Time stamp */ __net_timestamp(skb); skb_queue_tail(&hdev->rx_q, skb); queue_work(hdev->workqueue, &hdev->rx_work); return 0; } EXPORT_SYMBOL(hci_recv_frame); /* Receive diagnostic message from HCI drivers */ int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb) { /* Mark as diagnostic packet */ hci_skb_pkt_type(skb) = HCI_DIAG_PKT; /* Time stamp */ __net_timestamp(skb); skb_queue_tail(&hdev->rx_q, skb); queue_work(hdev->workqueue, &hdev->rx_work); return 0; } EXPORT_SYMBOL(hci_recv_diag); void hci_set_hw_info(struct hci_dev *hdev, const char *fmt, ...) { va_list vargs; va_start(vargs, fmt); kfree_const(hdev->hw_info); hdev->hw_info = kvasprintf_const(GFP_KERNEL, fmt, vargs); va_end(vargs); } EXPORT_SYMBOL(hci_set_hw_info); void hci_set_fw_info(struct hci_dev *hdev, const char *fmt, ...) { va_list vargs; va_start(vargs, fmt); kfree_const(hdev->fw_info); hdev->fw_info = kvasprintf_const(GFP_KERNEL, fmt, vargs); va_end(vargs); } EXPORT_SYMBOL(hci_set_fw_info); /* ---- Interface to upper protocols ---- */ int hci_register_cb(struct hci_cb *cb) { BT_DBG("%p name %s", cb, cb->name); mutex_lock(&hci_cb_list_lock); list_add_tail(&cb->list, &hci_cb_list); mutex_unlock(&hci_cb_list_lock); return 0; } EXPORT_SYMBOL(hci_register_cb); int hci_unregister_cb(struct hci_cb *cb) { BT_DBG("%p name %s", cb, cb->name); mutex_lock(&hci_cb_list_lock); list_del(&cb->list); mutex_unlock(&hci_cb_list_lock); return 0; } EXPORT_SYMBOL(hci_unregister_cb); static int hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb) { int err; BT_DBG("%s type %d len %d", hdev->name, hci_skb_pkt_type(skb), skb->len); /* Time stamp */ __net_timestamp(skb); /* Send copy to monitor */ hci_send_to_monitor(hdev, skb); if (atomic_read(&hdev->promisc)) { /* Send copy to the sockets */ hci_send_to_sock(hdev, skb); } /* Get rid of skb owner, prior to sending to the driver. */ skb_orphan(skb); if (!test_bit(HCI_RUNNING, &hdev->flags)) { kfree_skb(skb); return -EINVAL; } if (hci_skb_pkt_type(skb) == HCI_DRV_PKT) { /* Intercept HCI Drv packet here and don't go with hdev->send * callback. */ err = hci_drv_process_cmd(hdev, skb); kfree_skb(skb); return err; } err = hdev->send(hdev, skb); if (err < 0) { bt_dev_err(hdev, "sending frame failed (%d)", err); kfree_skb(skb); return err; } return 0; } static int hci_send_conn_frame(struct hci_dev *hdev, struct hci_conn *conn, struct sk_buff *skb) { hci_conn_tx_queue(conn, skb); return hci_send_frame(hdev, skb); } /* Send HCI command */ int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen, const void *param) { struct sk_buff *skb; BT_DBG("%s opcode 0x%4.4x plen %d", hdev->name, opcode, plen); skb = hci_cmd_sync_alloc(hdev, opcode, plen, param, NULL); if (!skb) { bt_dev_err(hdev, "no memory for command"); return -ENOMEM; } /* Stand-alone HCI commands must be flagged as * single-command requests. */ bt_cb(skb)->hci.req_flags |= HCI_REQ_START; skb_queue_tail(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); return 0; } int __hci_cmd_send(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param) { struct sk_buff *skb; if (hci_opcode_ogf(opcode) != 0x3f) { /* A controller receiving a command shall respond with either * a Command Status Event or a Command Complete Event. * Therefore, all standard HCI commands must be sent via the * standard API, using hci_send_cmd or hci_cmd_sync helpers. * Some vendors do not comply with this rule for vendor-specific * commands and do not return any event. We want to support * unresponded commands for such cases only. */ bt_dev_err(hdev, "unresponded command not supported"); return -EINVAL; } skb = hci_cmd_sync_alloc(hdev, opcode, plen, param, NULL); if (!skb) { bt_dev_err(hdev, "no memory for command (opcode 0x%4.4x)", opcode); return -ENOMEM; } hci_send_frame(hdev, skb); return 0; } EXPORT_SYMBOL(__hci_cmd_send); /* Get data from the previously sent command */ static void *hci_cmd_data(struct sk_buff *skb, __u16 opcode) { struct hci_command_hdr *hdr; if (!skb || skb->len < HCI_COMMAND_HDR_SIZE) return NULL; hdr = (void *)skb->data; if (hdr->opcode != cpu_to_le16(opcode)) return NULL; return skb->data + HCI_COMMAND_HDR_SIZE; } /* Get data from the previously sent command */ void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode) { void *data; /* Check if opcode matches last sent command */ data = hci_cmd_data(hdev->sent_cmd, opcode); if (!data) /* Check if opcode matches last request */ data = hci_cmd_data(hdev->req_skb, opcode); return data; } /* Get data from last received event */ void *hci_recv_event_data(struct hci_dev *hdev, __u8 event) { struct hci_event_hdr *hdr; int offset; if (!hdev->recv_event) return NULL; hdr = (void *)hdev->recv_event->data; offset = sizeof(*hdr); if (hdr->evt != event) { /* In case of LE metaevent check the subevent match */ if (hdr->evt == HCI_EV_LE_META) { struct hci_ev_le_meta *ev; ev = (void *)hdev->recv_event->data + offset; offset += sizeof(*ev); if (ev->subevent == event) goto found; } return NULL; } found: bt_dev_dbg(hdev, "event 0x%2.2x", event); return hdev->recv_event->data + offset; } /* Send ACL data */ static void hci_add_acl_hdr(struct sk_buff *skb, __u16 handle, __u16 flags) { struct hci_acl_hdr *hdr; int len = skb->len; skb_push(skb, HCI_ACL_HDR_SIZE); skb_reset_transport_header(skb); hdr = (struct hci_acl_hdr *)skb_transport_header(skb); hdr->handle = cpu_to_le16(hci_handle_pack(handle, flags)); hdr->dlen = cpu_to_le16(len); } static void hci_queue_acl(struct hci_chan *chan, struct sk_buff_head *queue, struct sk_buff *skb, __u16 flags) { struct hci_conn *conn = chan->conn; struct hci_dev *hdev = conn->hdev; struct sk_buff *list; skb->len = skb_headlen(skb); skb->data_len = 0; hci_skb_pkt_type(skb) = HCI_ACLDATA_PKT; hci_add_acl_hdr(skb, conn->handle, flags); list = skb_shinfo(skb)->frag_list; if (!list) { /* Non fragmented */ BT_DBG("%s nonfrag skb %p len %d", hdev->name, skb, skb->len); skb_queue_tail(queue, skb); } else { /* Fragmented */ BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); skb_shinfo(skb)->frag_list = NULL; /* Queue all fragments atomically. We need to use spin_lock_bh * here because of 6LoWPAN links, as there this function is * called from softirq and using normal spin lock could cause * deadlocks. */ spin_lock_bh(&queue->lock); __skb_queue_tail(queue, skb); flags &= ~ACL_START; flags |= ACL_CONT; do { skb = list; list = list->next; hci_skb_pkt_type(skb) = HCI_ACLDATA_PKT; hci_add_acl_hdr(skb, conn->handle, flags); BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); __skb_queue_tail(queue, skb); } while (list); spin_unlock_bh(&queue->lock); } bt_dev_dbg(hdev, "chan %p queued %d", chan, skb_queue_len(queue)); } void hci_send_acl(struct hci_chan *chan, struct sk_buff *skb, __u16 flags) { struct hci_dev *hdev = chan->conn->hdev; BT_DBG("%s chan %p flags 0x%4.4x", hdev->name, chan, flags); hci_queue_acl(chan, &chan->data_q, skb, flags); queue_work(hdev->workqueue, &hdev->tx_work); } /* Send SCO data */ void hci_send_sco(struct hci_conn *conn, struct sk_buff *skb) { struct hci_dev *hdev = conn->hdev; struct hci_sco_hdr hdr; BT_DBG("%s len %d", hdev->name, skb->len); hdr.handle = cpu_to_le16(conn->handle); hdr.dlen = skb->len; skb_push(skb, HCI_SCO_HDR_SIZE); skb_reset_transport_header(skb); memcpy(skb_transport_header(skb), &hdr, HCI_SCO_HDR_SIZE); hci_skb_pkt_type(skb) = HCI_SCODATA_PKT; skb_queue_tail(&conn->data_q, skb); bt_dev_dbg(hdev, "hcon %p queued %d", conn, skb_queue_len(&conn->data_q)); queue_work(hdev->workqueue, &hdev->tx_work); } /* Send ISO data */ static void hci_add_iso_hdr(struct sk_buff *skb, __u16 handle, __u8 flags) { struct hci_iso_hdr *hdr; int len = skb->len; skb_push(skb, HCI_ISO_HDR_SIZE); skb_reset_transport_header(skb); hdr = (struct hci_iso_hdr *)skb_transport_header(skb); hdr->handle = cpu_to_le16(hci_handle_pack(handle, flags)); hdr->dlen = cpu_to_le16(len); } static void hci_queue_iso(struct hci_conn *conn, struct sk_buff_head *queue, struct sk_buff *skb) { struct hci_dev *hdev = conn->hdev; struct sk_buff *list; __u16 flags; skb->len = skb_headlen(skb); skb->data_len = 0; hci_skb_pkt_type(skb) = HCI_ISODATA_PKT; list = skb_shinfo(skb)->frag_list; flags = hci_iso_flags_pack(list ? ISO_START : ISO_SINGLE, 0x00); hci_add_iso_hdr(skb, conn->handle, flags); if (!list) { /* Non fragmented */ BT_DBG("%s nonfrag skb %p len %d", hdev->name, skb, skb->len); skb_queue_tail(queue, skb); } else { /* Fragmented */ BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); skb_shinfo(skb)->frag_list = NULL; __skb_queue_tail(queue, skb); do { skb = list; list = list->next; hci_skb_pkt_type(skb) = HCI_ISODATA_PKT; flags = hci_iso_flags_pack(list ? ISO_CONT : ISO_END, 0x00); hci_add_iso_hdr(skb, conn->handle, flags); BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); __skb_queue_tail(queue, skb); } while (list); } bt_dev_dbg(hdev, "hcon %p queued %d", conn, skb_queue_len(queue)); } void hci_send_iso(struct hci_conn *conn, struct sk_buff *skb) { struct hci_dev *hdev = conn->hdev; BT_DBG("%s len %d", hdev->name, skb->len); hci_queue_iso(conn, &conn->data_q, skb); queue_work(hdev->workqueue, &hdev->tx_work); } /* ---- HCI TX task (outgoing data) ---- */ /* HCI Connection scheduler */ static inline void hci_quote_sent(struct hci_conn *conn, int num, int *quote) { struct hci_dev *hdev; int cnt, q; if (!conn) { *quote = 0; return; } hdev = conn->hdev; switch (conn->type) { case ACL_LINK: cnt = hdev->acl_cnt; break; case SCO_LINK: case ESCO_LINK: cnt = hdev->sco_cnt; break; case LE_LINK: cnt = hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt; break; case CIS_LINK: case BIS_LINK: case PA_LINK: cnt = hdev->iso_cnt; break; default: cnt = 0; bt_dev_err(hdev, "unknown link type %d", conn->type); } q = cnt / num; *quote = q ? q : 1; } static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type, int *quote) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *conn = NULL, *c; unsigned int num = 0, min = ~0; /* We don't have to lock device here. Connections are always * added and removed with TX task disabled. */ rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { if (c->type != type || skb_queue_empty(&c->data_q)) continue; bt_dev_dbg(hdev, "hcon %p state %s queued %d", c, state_to_string(c->state), skb_queue_len(&c->data_q)); if (c->state != BT_CONNECTED && c->state != BT_CONFIG) continue; num++; if (c->sent < min) { min = c->sent; conn = c; } if (hci_conn_num(hdev, type) == num) break; } rcu_read_unlock(); hci_quote_sent(conn, num, quote); BT_DBG("conn %p quote %d", conn, *quote); return conn; } static void hci_link_tx_to(struct hci_dev *hdev, __u8 type) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; bt_dev_err(hdev, "link tx timeout"); hci_dev_lock(hdev); /* Kill stalled connections */ list_for_each_entry(c, &h->list, list) { if (c->type == type && c->sent) { bt_dev_err(hdev, "killing stalled connection %pMR", &c->dst); hci_disconnect(c, HCI_ERROR_REMOTE_USER_TERM); } } hci_dev_unlock(hdev); } static struct hci_chan *hci_chan_sent(struct hci_dev *hdev, __u8 type, int *quote) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_chan *chan = NULL; unsigned int num = 0, min = ~0, cur_prio = 0; struct hci_conn *conn; int conn_num = 0; BT_DBG("%s", hdev->name); rcu_read_lock(); list_for_each_entry_rcu(conn, &h->list, list) { struct hci_chan *tmp; if (conn->type != type) continue; if (conn->state != BT_CONNECTED && conn->state != BT_CONFIG) continue; conn_num++; list_for_each_entry_rcu(tmp, &conn->chan_list, list) { struct sk_buff *skb; if (skb_queue_empty(&tmp->data_q)) continue; skb = skb_peek(&tmp->data_q); if (skb->priority < cur_prio) continue; if (skb->priority > cur_prio) { num = 0; min = ~0; cur_prio = skb->priority; } num++; if (conn->sent < min) { min = conn->sent; chan = tmp; } } if (hci_conn_num(hdev, type) == conn_num) break; } rcu_read_unlock(); if (!chan) return NULL; hci_quote_sent(chan->conn, num, quote); BT_DBG("chan %p quote %d", chan, *quote); return chan; } static void hci_prio_recalculate(struct hci_dev *hdev, __u8 type) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *conn; int num = 0; BT_DBG("%s", hdev->name); rcu_read_lock(); list_for_each_entry_rcu(conn, &h->list, list) { struct hci_chan *chan; if (conn->type != type) continue; if (conn->state != BT_CONNECTED && conn->state != BT_CONFIG) continue; num++; list_for_each_entry_rcu(chan, &conn->chan_list, list) { struct sk_buff *skb; if (chan->sent) { chan->sent = 0; continue; } if (skb_queue_empty(&chan->data_q)) continue; skb = skb_peek(&chan->data_q); if (skb->priority >= HCI_PRIO_MAX - 1) continue; skb->priority = HCI_PRIO_MAX - 1; BT_DBG("chan %p skb %p promoted to %d", chan, skb, skb->priority); } if (hci_conn_num(hdev, type) == num) break; } rcu_read_unlock(); } static void __check_timeout(struct hci_dev *hdev, unsigned int cnt, u8 type) { unsigned long timeout; if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) return; switch (type) { case ACL_LINK: /* tx timeout must be longer than maximum link supervision * timeout (40.9 seconds) */ timeout = hdev->acl_last_tx + HCI_ACL_TX_TIMEOUT; break; case LE_LINK: /* tx timeout must be longer than maximum link supervision * timeout (40.9 seconds) */ timeout = hdev->le_last_tx + HCI_ACL_TX_TIMEOUT; break; case CIS_LINK: case BIS_LINK: case PA_LINK: /* tx timeout must be longer than the maximum transport latency * (8.388607 seconds) */ timeout = hdev->iso_last_tx + HCI_ISO_TX_TIMEOUT; break; default: return; } if (!cnt && time_after(jiffies, timeout)) hci_link_tx_to(hdev, type); } /* Schedule SCO */ static void hci_sched_sco(struct hci_dev *hdev, __u8 type) { struct hci_conn *conn; struct sk_buff *skb; int quote, *cnt; unsigned int pkts = hdev->sco_pkts; bt_dev_dbg(hdev, "type %u", type); if (!hci_conn_num(hdev, type) || !pkts) return; /* Use sco_pkts if flow control has not been enabled which will limit * the amount of buffer sent in a row. */ if (!hci_dev_test_flag(hdev, HCI_SCO_FLOWCTL)) cnt = &pkts; else cnt = &hdev->sco_cnt; while (*cnt && (conn = hci_low_sent(hdev, type, &quote))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); hci_send_conn_frame(hdev, conn, skb); conn->sent++; if (conn->sent == ~0) conn->sent = 0; (*cnt)--; } } /* Rescheduled if all packets were sent and flow control is not enabled * as there could be more packets queued that could not be sent and * since no HCI_EV_NUM_COMP_PKTS event will be generated the reschedule * needs to be forced. */ if (!pkts && !hci_dev_test_flag(hdev, HCI_SCO_FLOWCTL)) queue_work(hdev->workqueue, &hdev->tx_work); } static void hci_sched_acl_pkt(struct hci_dev *hdev) { unsigned int cnt = hdev->acl_cnt; struct hci_chan *chan; struct sk_buff *skb; int quote; __check_timeout(hdev, cnt, ACL_LINK); while (hdev->acl_cnt && (chan = hci_chan_sent(hdev, ACL_LINK, &quote))) { u32 priority = (skb_peek(&chan->data_q))->priority; while (quote-- && (skb = skb_peek(&chan->data_q))) { BT_DBG("chan %p skb %p len %d priority %u", chan, skb, skb->len, skb->priority); /* Stop if priority has changed */ if (skb->priority < priority) break; skb = skb_dequeue(&chan->data_q); hci_conn_enter_active_mode(chan->conn, bt_cb(skb)->force_active); hci_send_conn_frame(hdev, chan->conn, skb); hdev->acl_last_tx = jiffies; hdev->acl_cnt--; chan->sent++; chan->conn->sent++; /* Send pending SCO packets right away */ hci_sched_sco(hdev, SCO_LINK); hci_sched_sco(hdev, ESCO_LINK); } } if (cnt != hdev->acl_cnt) hci_prio_recalculate(hdev, ACL_LINK); } static void hci_sched_acl(struct hci_dev *hdev) { BT_DBG("%s", hdev->name); /* No ACL link over BR/EDR controller */ if (!hci_conn_num(hdev, ACL_LINK)) return; hci_sched_acl_pkt(hdev); } static void hci_sched_le(struct hci_dev *hdev) { struct hci_chan *chan; struct sk_buff *skb; int quote, *cnt, tmp; BT_DBG("%s", hdev->name); if (!hci_conn_num(hdev, LE_LINK)) return; cnt = hdev->le_pkts ? &hdev->le_cnt : &hdev->acl_cnt; __check_timeout(hdev, *cnt, LE_LINK); tmp = *cnt; while (*cnt && (chan = hci_chan_sent(hdev, LE_LINK, &quote))) { u32 priority = (skb_peek(&chan->data_q))->priority; while (quote-- && (skb = skb_peek(&chan->data_q))) { BT_DBG("chan %p skb %p len %d priority %u", chan, skb, skb->len, skb->priority); /* Stop if priority has changed */ if (skb->priority < priority) break; skb = skb_dequeue(&chan->data_q); hci_send_conn_frame(hdev, chan->conn, skb); hdev->le_last_tx = jiffies; (*cnt)--; chan->sent++; chan->conn->sent++; /* Send pending SCO packets right away */ hci_sched_sco(hdev, SCO_LINK); hci_sched_sco(hdev, ESCO_LINK); } } if (*cnt != tmp) hci_prio_recalculate(hdev, LE_LINK); } /* Schedule iso */ static void hci_sched_iso(struct hci_dev *hdev, __u8 type) { struct hci_conn *conn; struct sk_buff *skb; int quote, *cnt; BT_DBG("%s", hdev->name); if (!hci_conn_num(hdev, type)) return; cnt = &hdev->iso_cnt; __check_timeout(hdev, *cnt, type); while (*cnt && (conn = hci_low_sent(hdev, type, &quote))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); hci_send_conn_frame(hdev, conn, skb); hdev->iso_last_tx = jiffies; conn->sent++; if (conn->sent == ~0) conn->sent = 0; (*cnt)--; } } } static void hci_tx_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, tx_work); struct sk_buff *skb; BT_DBG("%s acl %d sco %d le %d iso %d", hdev->name, hdev->acl_cnt, hdev->sco_cnt, hdev->le_cnt, hdev->iso_cnt); if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { /* Schedule queues and send stuff to HCI driver */ hci_sched_sco(hdev, SCO_LINK); hci_sched_sco(hdev, ESCO_LINK); hci_sched_iso(hdev, CIS_LINK); hci_sched_iso(hdev, BIS_LINK); hci_sched_iso(hdev, PA_LINK); hci_sched_acl(hdev); hci_sched_le(hdev); } /* Send next queued raw (unknown type) packet */ while ((skb = skb_dequeue(&hdev->raw_q))) hci_send_frame(hdev, skb); } /* ----- HCI RX task (incoming data processing) ----- */ /* ACL data packet */ static void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_acl_hdr *hdr; struct hci_conn *conn; __u16 handle, flags; hdr = skb_pull_data(skb, sizeof(*hdr)); if (!hdr) { bt_dev_err(hdev, "ACL packet too small"); goto drop; } handle = __le16_to_cpu(hdr->handle); flags = hci_flags(handle); handle = hci_handle(handle); bt_dev_dbg(hdev, "len %d handle 0x%4.4x flags 0x%4.4x", skb->len, handle, flags); hdev->stat.acl_rx++; hci_dev_lock(hdev); conn = hci_conn_hash_lookup_handle(hdev, handle); hci_dev_unlock(hdev); if (conn) { hci_conn_enter_active_mode(conn, BT_POWER_FORCE_ACTIVE_OFF); /* Send to upper protocol */ l2cap_recv_acldata(conn, skb, flags); return; } else { bt_dev_err(hdev, "ACL packet for unknown connection handle %d", handle); } drop: kfree_skb(skb); } /* SCO data packet */ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_sco_hdr *hdr; struct hci_conn *conn; __u16 handle, flags; hdr = skb_pull_data(skb, sizeof(*hdr)); if (!hdr) { bt_dev_err(hdev, "SCO packet too small"); goto drop; } handle = __le16_to_cpu(hdr->handle); flags = hci_flags(handle); handle = hci_handle(handle); bt_dev_dbg(hdev, "len %d handle 0x%4.4x flags 0x%4.4x", skb->len, handle, flags); hdev->stat.sco_rx++; hci_dev_lock(hdev); conn = hci_conn_hash_lookup_handle(hdev, handle); hci_dev_unlock(hdev); if (conn) { /* Send to upper protocol */ hci_skb_pkt_status(skb) = flags & 0x03; sco_recv_scodata(conn, skb); return; } else { bt_dev_err_ratelimited(hdev, "SCO packet for unknown connection handle %d", handle); } drop: kfree_skb(skb); } static void hci_isodata_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_iso_hdr *hdr; struct hci_conn *conn; __u16 handle, flags; hdr = skb_pull_data(skb, sizeof(*hdr)); if (!hdr) { bt_dev_err(hdev, "ISO packet too small"); goto drop; } handle = __le16_to_cpu(hdr->handle); flags = hci_flags(handle); handle = hci_handle(handle); bt_dev_dbg(hdev, "len %d handle 0x%4.4x flags 0x%4.4x", skb->len, handle, flags); hci_dev_lock(hdev); conn = hci_conn_hash_lookup_handle(hdev, handle); hci_dev_unlock(hdev); if (!conn) { bt_dev_err(hdev, "ISO packet for unknown connection handle %d", handle); goto drop; } /* Send to upper protocol */ iso_recv(conn, skb, flags); return; drop: kfree_skb(skb); } static bool hci_req_is_complete(struct hci_dev *hdev) { struct sk_buff *skb; skb = skb_peek(&hdev->cmd_q); if (!skb) return true; return (bt_cb(skb)->hci.req_flags & HCI_REQ_START); } static void hci_resend_last(struct hci_dev *hdev) { struct hci_command_hdr *sent; struct sk_buff *skb; u16 opcode; if (!hdev->sent_cmd) return; sent = (void *) hdev->sent_cmd->data; opcode = __le16_to_cpu(sent->opcode); if (opcode == HCI_OP_RESET) return; skb = skb_clone(hdev->sent_cmd, GFP_KERNEL); if (!skb) return; skb_queue_head(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); } void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status, hci_req_complete_t *req_complete, hci_req_complete_skb_t *req_complete_skb) { struct sk_buff *skb; unsigned long flags; BT_DBG("opcode 0x%04x status 0x%02x", opcode, status); /* If the completed command doesn't match the last one that was * sent we need to do special handling of it. */ if (!hci_sent_cmd_data(hdev, opcode)) { /* Some CSR based controllers generate a spontaneous * reset complete event during init and any pending * command will never be completed. In such a case we * need to resend whatever was the last sent * command. */ if (test_bit(HCI_INIT, &hdev->flags) && opcode == HCI_OP_RESET) hci_resend_last(hdev); return; } /* If we reach this point this event matches the last command sent */ hci_dev_clear_flag(hdev, HCI_CMD_PENDING); /* If the command succeeded and there's still more commands in * this request the request is not yet complete. */ if (!status && !hci_req_is_complete(hdev)) return; skb = hdev->req_skb; /* If this was the last command in a request the complete * callback would be found in hdev->req_skb instead of the * command queue (hdev->cmd_q). */ if (skb && bt_cb(skb)->hci.req_flags & HCI_REQ_SKB) { *req_complete_skb = bt_cb(skb)->hci.req_complete_skb; return; } if (skb && bt_cb(skb)->hci.req_complete) { *req_complete = bt_cb(skb)->hci.req_complete; return; } /* Remove all pending commands belonging to this request */ spin_lock_irqsave(&hdev->cmd_q.lock, flags); while ((skb = __skb_dequeue(&hdev->cmd_q))) { if (bt_cb(skb)->hci.req_flags & HCI_REQ_START) { __skb_queue_head(&hdev->cmd_q, skb); break; } if (bt_cb(skb)->hci.req_flags & HCI_REQ_SKB) *req_complete_skb = bt_cb(skb)->hci.req_complete_skb; else *req_complete = bt_cb(skb)->hci.req_complete; dev_kfree_skb_irq(skb); } spin_unlock_irqrestore(&hdev->cmd_q.lock, flags); } static void hci_rx_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, rx_work); struct sk_buff *skb; BT_DBG("%s", hdev->name); /* The kcov_remote functions used for collecting packet parsing * coverage information from this background thread and associate * the coverage with the syscall's thread which originally injected * the packet. This helps fuzzing the kernel. */ for (; (skb = skb_dequeue(&hdev->rx_q)); kcov_remote_stop()) { kcov_remote_start_common(skb_get_kcov_handle(skb)); /* Send copy to monitor */ hci_send_to_monitor(hdev, skb); if (atomic_read(&hdev->promisc)) { /* Send copy to the sockets */ hci_send_to_sock(hdev, skb); } /* If the device has been opened in HCI_USER_CHANNEL, * the userspace has exclusive access to device. * When device is HCI_INIT, we still need to process * the data packets to the driver in order * to complete its setup(). */ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && !test_bit(HCI_INIT, &hdev->flags)) { kfree_skb(skb); continue; } if (test_bit(HCI_INIT, &hdev->flags)) { /* Don't process data packets in this states. */ switch (hci_skb_pkt_type(skb)) { case HCI_ACLDATA_PKT: case HCI_SCODATA_PKT: case HCI_ISODATA_PKT: kfree_skb(skb); continue; } } /* Process frame */ switch (hci_skb_pkt_type(skb)) { case HCI_EVENT_PKT: BT_DBG("%s Event packet", hdev->name); hci_event_packet(hdev, skb); break; case HCI_ACLDATA_PKT: BT_DBG("%s ACL data packet", hdev->name); hci_acldata_packet(hdev, skb); break; case HCI_SCODATA_PKT: BT_DBG("%s SCO data packet", hdev->name); hci_scodata_packet(hdev, skb); break; case HCI_ISODATA_PKT: BT_DBG("%s ISO data packet", hdev->name); hci_isodata_packet(hdev, skb); break; default: kfree_skb(skb); break; } } } static void hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb) { int err; bt_dev_dbg(hdev, "skb %p", skb); kfree_skb(hdev->sent_cmd); hdev->sent_cmd = skb_clone(skb, GFP_KERNEL); if (!hdev->sent_cmd) { skb_queue_head(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); return; } if (hci_skb_opcode(skb) != HCI_OP_NOP) { err = hci_send_frame(hdev, skb); if (err < 0) { hci_cmd_sync_cancel_sync(hdev, -err); return; } atomic_dec(&hdev->cmd_cnt); } if (hdev->req_status == HCI_REQ_PEND && !hci_dev_test_and_set_flag(hdev, HCI_CMD_PENDING)) { kfree_skb(hdev->req_skb); hdev->req_skb = skb_clone(hdev->sent_cmd, GFP_KERNEL); } } static void hci_cmd_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_work); struct sk_buff *skb; BT_DBG("%s cmd_cnt %d cmd queued %d", hdev->name, atomic_read(&hdev->cmd_cnt), skb_queue_len(&hdev->cmd_q)); /* Send queued commands */ if (atomic_read(&hdev->cmd_cnt)) { skb = skb_dequeue(&hdev->cmd_q); if (!skb) return; hci_send_cmd_sync(hdev, skb); rcu_read_lock(); if (test_bit(HCI_RESET, &hdev->flags) || hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE)) cancel_delayed_work(&hdev->cmd_timer); else queue_delayed_work(hdev->workqueue, &hdev->cmd_timer, HCI_CMD_TIMEOUT); rcu_read_unlock(); } }
364 60 389 301 7 22 71 20 295 172 173 190 283 17 69 296 294 69 29 24 190 192 13 6 1 11 12 173 171 173 12 171 173 194 18 168 32 44 165 32 138 298 298 53 248 298 298 299 216 98 226 226 226 226 41 189 226 225 226 225 226 5 222 5 19 1 13 13 2 12 11 12 19 1 1 4 1 4 4 23 4 19 4 3 3 23 4 3 413 414 19 399 400 395 415 192 47 47 463 283 4 282 50 49 50 209 462 4 511 5 3 24 337 44 44 18 13 260 247 18 223 220 215 3 442 139 431 260 18 248 570 1 1 565 1 511 561 4 4 4 564 134 442 563 120 448 337 53 244 90 247 578 1 571 7 296 337 294 260 337 578 11 11 11 11 225 226 225 223 223 3 218 3 2 2 2 2 5 4 4 3 3 1 5 3 4 4 1 7 3 7 1 7 6 1 27 18 8 1 9 12 11 2 7 1 13 3 260 3 216 578 342 260 260 611 610 1 6 579 22 5 3 2 1 30 1 1 6 2 19 17 2 18 1 18 1 19 12 11 11 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/act_api.c Packet action API. * * Author: Jamal Hadi Salim */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/kmod.h> #include <linux/err.h> #include <linux/module.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/sch_generic.h> #include <net/pkt_cls.h> #include <net/tc_act/tc_pedit.h> #include <net/act_api.h> #include <net/netlink.h> #include <net/flow_offload.h> #include <net/tc_wrapper.h> #ifdef CONFIG_INET DEFINE_STATIC_KEY_FALSE(tcf_frag_xmit_count); EXPORT_SYMBOL_GPL(tcf_frag_xmit_count); #endif int tcf_dev_queue_xmit(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb)) { #ifdef CONFIG_INET if (static_branch_unlikely(&tcf_frag_xmit_count)) return sch_frag_xmit_hook(skb, xmit); #endif return xmit(skb); } EXPORT_SYMBOL_GPL(tcf_dev_queue_xmit); static void tcf_action_goto_chain_exec(const struct tc_action *a, struct tcf_result *res) { const struct tcf_chain *chain = rcu_dereference_bh(a->goto_chain); res->goto_tp = rcu_dereference_bh(chain->filter_chain); } static void tcf_free_cookie_rcu(struct rcu_head *p) { struct tc_cookie *cookie = container_of(p, struct tc_cookie, rcu); kfree(cookie->data); kfree(cookie); } static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie, struct tc_cookie *new_cookie) { struct tc_cookie *old; old = unrcu_pointer(xchg(old_cookie, RCU_INITIALIZER(new_cookie))); if (old) call_rcu(&old->rcu, tcf_free_cookie_rcu); } int tcf_action_check_ctrlact(int action, struct tcf_proto *tp, struct tcf_chain **newchain, struct netlink_ext_ack *extack) { int opcode = TC_ACT_EXT_OPCODE(action), ret = -EINVAL; u32 chain_index; if (!opcode) ret = action > TC_ACT_VALUE_MAX ? -EINVAL : 0; else if (opcode <= TC_ACT_EXT_OPCODE_MAX || action == TC_ACT_UNSPEC) ret = 0; if (ret) { NL_SET_ERR_MSG(extack, "invalid control action"); goto end; } if (TC_ACT_EXT_CMP(action, TC_ACT_GOTO_CHAIN)) { chain_index = action & TC_ACT_EXT_VAL_MASK; if (!tp || !newchain) { ret = -EINVAL; NL_SET_ERR_MSG(extack, "can't goto NULL proto/chain"); goto end; } *newchain = tcf_chain_get_by_act(tp->chain->block, chain_index); if (!*newchain) { ret = -ENOMEM; NL_SET_ERR_MSG(extack, "can't allocate goto_chain"); } } end: return ret; } EXPORT_SYMBOL(tcf_action_check_ctrlact); struct tcf_chain *tcf_action_set_ctrlact(struct tc_action *a, int action, struct tcf_chain *goto_chain) { a->tcfa_action = action; goto_chain = rcu_replace_pointer(a->goto_chain, goto_chain, 1); return goto_chain; } EXPORT_SYMBOL(tcf_action_set_ctrlact); /* XXX: For standalone actions, we don't need a RCU grace period either, because * actions are always connected to filters and filters are already destroyed in * RCU callbacks, so after a RCU grace period actions are already disconnected * from filters. Readers later can not find us. */ static void free_tcf(struct tc_action *p) { struct tcf_chain *chain = rcu_dereference_protected(p->goto_chain, 1); free_percpu(p->cpu_bstats); free_percpu(p->cpu_bstats_hw); free_percpu(p->cpu_qstats); tcf_set_action_cookie(&p->user_cookie, NULL); if (chain) tcf_chain_put_by_act(chain); kfree(p); } static void offload_action_hw_count_set(struct tc_action *act, u32 hw_count) { act->in_hw_count = hw_count; } static void offload_action_hw_count_inc(struct tc_action *act, u32 hw_count) { act->in_hw_count += hw_count; } static void offload_action_hw_count_dec(struct tc_action *act, u32 hw_count) { act->in_hw_count = act->in_hw_count > hw_count ? act->in_hw_count - hw_count : 0; } static unsigned int tcf_offload_act_num_actions_single(struct tc_action *act) { if (is_tcf_pedit(act)) return tcf_pedit_nkeys(act); else return 1; } static bool tc_act_skip_hw(u32 flags) { return (flags & TCA_ACT_FLAGS_SKIP_HW) ? true : false; } static bool tc_act_skip_sw(u32 flags) { return (flags & TCA_ACT_FLAGS_SKIP_SW) ? true : false; } /* SKIP_HW and SKIP_SW are mutually exclusive flags. */ static bool tc_act_flags_valid(u32 flags) { flags &= TCA_ACT_FLAGS_SKIP_HW | TCA_ACT_FLAGS_SKIP_SW; return flags ^ (TCA_ACT_FLAGS_SKIP_HW | TCA_ACT_FLAGS_SKIP_SW); } static int offload_action_init(struct flow_offload_action *fl_action, struct tc_action *act, enum offload_act_command cmd, struct netlink_ext_ack *extack) { int err; fl_action->extack = extack; fl_action->command = cmd; fl_action->index = act->tcfa_index; fl_action->cookie = (unsigned long)act; if (act->ops->offload_act_setup) { spin_lock_bh(&act->tcfa_lock); err = act->ops->offload_act_setup(act, fl_action, NULL, false, extack); spin_unlock_bh(&act->tcfa_lock); return err; } return -EOPNOTSUPP; } static int tcf_action_offload_cmd_ex(struct flow_offload_action *fl_act, u32 *hw_count) { int err; err = flow_indr_dev_setup_offload(NULL, NULL, TC_SETUP_ACT, fl_act, NULL, NULL); if (err < 0) return err; if (hw_count) *hw_count = err; return 0; } static int tcf_action_offload_cmd_cb_ex(struct flow_offload_action *fl_act, u32 *hw_count, flow_indr_block_bind_cb_t *cb, void *cb_priv) { int err; err = cb(NULL, NULL, cb_priv, TC_SETUP_ACT, NULL, fl_act, NULL); if (err < 0) return err; if (hw_count) *hw_count = 1; return 0; } static int tcf_action_offload_cmd(struct flow_offload_action *fl_act, u32 *hw_count, flow_indr_block_bind_cb_t *cb, void *cb_priv) { return cb ? tcf_action_offload_cmd_cb_ex(fl_act, hw_count, cb, cb_priv) : tcf_action_offload_cmd_ex(fl_act, hw_count); } static int tcf_action_offload_add_ex(struct tc_action *action, struct netlink_ext_ack *extack, flow_indr_block_bind_cb_t *cb, void *cb_priv) { bool skip_sw = tc_act_skip_sw(action->tcfa_flags); struct tc_action *actions[TCA_ACT_MAX_PRIO] = { [0] = action, }; struct flow_offload_action *fl_action; u32 in_hw_count = 0; int num, err = 0; if (tc_act_skip_hw(action->tcfa_flags)) return 0; num = tcf_offload_act_num_actions_single(action); fl_action = offload_action_alloc(num); if (!fl_action) return -ENOMEM; err = offload_action_init(fl_action, action, FLOW_ACT_REPLACE, extack); if (err) goto fl_err; err = tc_setup_action(&fl_action->action, actions, 0, extack); if (err) { NL_SET_ERR_MSG_MOD(extack, "Failed to setup tc actions for offload"); goto fl_err; } err = tcf_action_offload_cmd(fl_action, &in_hw_count, cb, cb_priv); if (!err) cb ? offload_action_hw_count_inc(action, in_hw_count) : offload_action_hw_count_set(action, in_hw_count); if (skip_sw && !tc_act_in_hw(action)) err = -EINVAL; tc_cleanup_offload_action(&fl_action->action); fl_err: kfree(fl_action); return err; } /* offload the tc action after it is inserted */ static int tcf_action_offload_add(struct tc_action *action, struct netlink_ext_ack *extack) { return tcf_action_offload_add_ex(action, extack, NULL, NULL); } int tcf_action_update_hw_stats(struct tc_action *action) { struct flow_offload_action fl_act = {}; int err; err = offload_action_init(&fl_act, action, FLOW_ACT_STATS, NULL); if (err) return err; err = tcf_action_offload_cmd(&fl_act, NULL, NULL, NULL); if (!err) { preempt_disable(); tcf_action_stats_update(action, fl_act.stats.bytes, fl_act.stats.pkts, fl_act.stats.drops, fl_act.stats.lastused, true); preempt_enable(); action->used_hw_stats = fl_act.stats.used_hw_stats; action->used_hw_stats_valid = true; } else { return -EOPNOTSUPP; } return 0; } EXPORT_SYMBOL(tcf_action_update_hw_stats); static int tcf_action_offload_del_ex(struct tc_action *action, flow_indr_block_bind_cb_t *cb, void *cb_priv) { struct flow_offload_action fl_act = {}; u32 in_hw_count = 0; int err = 0; if (!tc_act_in_hw(action)) return 0; err = offload_action_init(&fl_act, action, FLOW_ACT_DESTROY, NULL); if (err) return err; err = tcf_action_offload_cmd(&fl_act, &in_hw_count, cb, cb_priv); if (err < 0) return err; if (!cb && action->in_hw_count != in_hw_count) return -EINVAL; /* do not need to update hw state when deleting action */ if (cb && in_hw_count) offload_action_hw_count_dec(action, in_hw_count); return 0; } static int tcf_action_offload_del(struct tc_action *action) { return tcf_action_offload_del_ex(action, NULL, NULL); } static void tcf_action_cleanup(struct tc_action *p) { tcf_action_offload_del(p); if (p->ops->cleanup) p->ops->cleanup(p); gen_kill_estimator(&p->tcfa_rate_est); free_tcf(p); } static int __tcf_action_put(struct tc_action *p, bool bind) { struct tcf_idrinfo *idrinfo = p->idrinfo; if (refcount_dec_and_mutex_lock(&p->tcfa_refcnt, &idrinfo->lock)) { if (bind) atomic_dec(&p->tcfa_bindcnt); idr_remove(&idrinfo->action_idr, p->tcfa_index); mutex_unlock(&idrinfo->lock); tcf_action_cleanup(p); return 1; } if (bind) atomic_dec(&p->tcfa_bindcnt); return 0; } static int __tcf_idr_release(struct tc_action *p, bool bind, bool strict) { int ret = 0; /* Release with strict==1 and bind==0 is only called through act API * interface (classifiers always bind). Only case when action with * positive reference count and zero bind count can exist is when it was * also created with act API (unbinding last classifier will destroy the * action if it was created by classifier). So only case when bind count * can be changed after initial check is when unbound action is * destroyed by act API while classifier binds to action with same id * concurrently. This result either creation of new action(same behavior * as before), or reusing existing action if concurrent process * increments reference count before action is deleted. Both scenarios * are acceptable. */ if (p) { if (!bind && strict && atomic_read(&p->tcfa_bindcnt) > 0) return -EPERM; if (__tcf_action_put(p, bind)) ret = ACT_P_DELETED; } return ret; } int tcf_idr_release(struct tc_action *a, bool bind) { const struct tc_action_ops *ops = a->ops; int ret; ret = __tcf_idr_release(a, bind, false); if (ret == ACT_P_DELETED) module_put(ops->owner); return ret; } EXPORT_SYMBOL(tcf_idr_release); static size_t tcf_action_shared_attrs_size(const struct tc_action *act) { struct tc_cookie *user_cookie; u32 cookie_len = 0; rcu_read_lock(); user_cookie = rcu_dereference(act->user_cookie); if (user_cookie) cookie_len = nla_total_size(user_cookie->len); rcu_read_unlock(); return nla_total_size(0) /* action number nested */ + nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */ + cookie_len /* TCA_ACT_COOKIE */ + nla_total_size(sizeof(struct nla_bitfield32)) /* TCA_ACT_HW_STATS */ + nla_total_size(0) /* TCA_ACT_STATS nested */ + nla_total_size(sizeof(struct nla_bitfield32)) /* TCA_ACT_FLAGS */ /* TCA_STATS_BASIC */ + nla_total_size_64bit(sizeof(struct gnet_stats_basic)) /* TCA_STATS_PKT64 */ + nla_total_size_64bit(sizeof(u64)) /* TCA_STATS_QUEUE */ + nla_total_size_64bit(sizeof(struct gnet_stats_queue)) + nla_total_size(0) /* TCA_ACT_OPTIONS nested */ + nla_total_size(sizeof(struct tcf_t)); /* TCA_GACT_TM */ } static size_t tcf_action_full_attrs_size(size_t sz) { return NLMSG_HDRLEN /* struct nlmsghdr */ + sizeof(struct tcamsg) + nla_total_size(0) /* TCA_ACT_TAB nested */ + sz; } static size_t tcf_action_fill_size(const struct tc_action *act) { size_t sz = tcf_action_shared_attrs_size(act); if (act->ops->get_fill_size) return act->ops->get_fill_size(act) + sz; return sz; } static int tcf_action_dump_terse(struct sk_buff *skb, struct tc_action *a, bool from_act) { unsigned char *b = skb_tail_pointer(skb); struct tc_cookie *cookie; if (nla_put_string(skb, TCA_ACT_KIND, a->ops->kind)) goto nla_put_failure; if (tcf_action_copy_stats(skb, a, 0)) goto nla_put_failure; if (from_act && nla_put_u32(skb, TCA_ACT_INDEX, a->tcfa_index)) goto nla_put_failure; rcu_read_lock(); cookie = rcu_dereference(a->user_cookie); if (cookie) { if (nla_put(skb, TCA_ACT_COOKIE, cookie->len, cookie->data)) { rcu_read_unlock(); goto nla_put_failure; } } rcu_read_unlock(); return 0; nla_put_failure: nlmsg_trim(skb, b); return -1; } static int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; int err = -EINVAL; u32 flags; if (tcf_action_dump_terse(skb, a, false)) goto nla_put_failure; if (a->hw_stats != TCA_ACT_HW_STATS_ANY && nla_put_bitfield32(skb, TCA_ACT_HW_STATS, a->hw_stats, TCA_ACT_HW_STATS_ANY)) goto nla_put_failure; if (a->used_hw_stats_valid && nla_put_bitfield32(skb, TCA_ACT_USED_HW_STATS, a->used_hw_stats, TCA_ACT_HW_STATS_ANY)) goto nla_put_failure; flags = a->tcfa_flags & TCA_ACT_FLAGS_USER_MASK; if (flags && nla_put_bitfield32(skb, TCA_ACT_FLAGS, flags, flags)) goto nla_put_failure; if (nla_put_u32(skb, TCA_ACT_IN_HW_COUNT, a->in_hw_count)) goto nla_put_failure; nest = nla_nest_start_noflag(skb, TCA_ACT_OPTIONS); if (nest == NULL) goto nla_put_failure; err = tcf_action_dump_old(skb, a, bind, ref); if (err > 0) { nla_nest_end(skb, nest); return err; } nla_put_failure: nlmsg_trim(skb, b); return -1; } static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, struct netlink_callback *cb) { int err = 0, index = -1, s_i = 0, n_i = 0; u32 act_flags = cb->args[2]; unsigned long jiffy_since = cb->args[3]; struct nlattr *nest; struct idr *idr = &idrinfo->action_idr; struct tc_action *p; unsigned long id = 1; unsigned long tmp; mutex_lock(&idrinfo->lock); s_i = cb->args[0]; idr_for_each_entry_ul(idr, p, tmp, id) { index++; if (index < s_i) continue; if (IS_ERR(p)) continue; if (jiffy_since && time_after(jiffy_since, (unsigned long)p->tcfa_tm.lastuse)) continue; tcf_action_update_hw_stats(p); nest = nla_nest_start_noflag(skb, n_i); if (!nest) { index--; goto nla_put_failure; } err = (act_flags & TCA_ACT_FLAG_TERSE_DUMP) ? tcf_action_dump_terse(skb, p, true) : tcf_action_dump_1(skb, p, 0, 0); if (err < 0) { index--; nlmsg_trim(skb, nest); goto done; } nla_nest_end(skb, nest); n_i++; if (!(act_flags & TCA_ACT_FLAG_LARGE_DUMP_ON) && n_i >= TCA_ACT_MAX_PRIO) goto done; } done: if (index >= 0) cb->args[0] = index + 1; mutex_unlock(&idrinfo->lock); if (n_i) { if (act_flags & TCA_ACT_FLAG_LARGE_DUMP_ON) cb->args[1] = n_i; } return n_i; nla_put_failure: nla_nest_cancel(skb, nest); goto done; } static int tcf_idr_release_unsafe(struct tc_action *p) { if (atomic_read(&p->tcfa_bindcnt) > 0) return -EPERM; if (refcount_dec_and_test(&p->tcfa_refcnt)) { idr_remove(&p->idrinfo->action_idr, p->tcfa_index); tcf_action_cleanup(p); return ACT_P_DELETED; } return 0; } static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, const struct tc_action_ops *ops, struct netlink_ext_ack *extack) { struct nlattr *nest; int n_i = 0; int ret = -EINVAL; struct idr *idr = &idrinfo->action_idr; struct tc_action *p; unsigned long id = 1; unsigned long tmp; nest = nla_nest_start_noflag(skb, 0); if (nest == NULL) goto nla_put_failure; if (nla_put_string(skb, TCA_ACT_KIND, ops->kind)) goto nla_put_failure; ret = 0; mutex_lock(&idrinfo->lock); idr_for_each_entry_ul(idr, p, tmp, id) { if (IS_ERR(p)) continue; ret = tcf_idr_release_unsafe(p); if (ret == ACT_P_DELETED) module_put(ops->owner); else if (ret < 0) break; n_i++; } mutex_unlock(&idrinfo->lock); if (ret < 0) { if (n_i) NL_SET_ERR_MSG(extack, "Unable to flush all TC actions"); else goto nla_put_failure; } ret = nla_put_u32(skb, TCA_FCNT, n_i); if (ret) goto nla_put_failure; nla_nest_end(skb, nest); return n_i; nla_put_failure: nla_nest_cancel(skb, nest); return ret; } int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops, struct netlink_ext_ack *extack) { struct tcf_idrinfo *idrinfo = tn->idrinfo; if (type == RTM_DELACTION) { return tcf_del_walker(idrinfo, skb, ops, extack); } else if (type == RTM_GETACTION) { return tcf_dump_walker(idrinfo, skb, cb); } else { WARN(1, "tcf_generic_walker: unknown command %d\n", type); NL_SET_ERR_MSG(extack, "tcf_generic_walker: unknown command"); return -EINVAL; } } EXPORT_SYMBOL(tcf_generic_walker); int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index) { struct tcf_idrinfo *idrinfo = tn->idrinfo; struct tc_action *p; mutex_lock(&idrinfo->lock); p = idr_find(&idrinfo->action_idr, index); if (IS_ERR(p)) p = NULL; else if (p) refcount_inc(&p->tcfa_refcnt); mutex_unlock(&idrinfo->lock); if (p) { *a = p; return true; } return false; } EXPORT_SYMBOL(tcf_idr_search); static int __tcf_generic_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ops->net_id); if (unlikely(ops->walk)) return ops->walk(net, skb, cb, type, ops, extack); return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int __tcf_idr_search(struct net *net, const struct tc_action_ops *ops, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, ops->net_id); if (unlikely(ops->lookup)) return ops->lookup(net, a, index); return tcf_idr_search(tn, a, index); } static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index) { struct tc_action *p; int ret = 0; mutex_lock(&idrinfo->lock); p = idr_find(&idrinfo->action_idr, index); if (!p) { mutex_unlock(&idrinfo->lock); return -ENOENT; } if (!atomic_read(&p->tcfa_bindcnt)) { if (refcount_dec_and_test(&p->tcfa_refcnt)) { struct module *owner = p->ops->owner; WARN_ON(p != idr_remove(&idrinfo->action_idr, p->tcfa_index)); mutex_unlock(&idrinfo->lock); tcf_action_cleanup(p); module_put(owner); return 0; } ret = 0; } else { ret = -EPERM; } mutex_unlock(&idrinfo->lock); return ret; } int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, bool cpustats, u32 flags) { struct tc_action *p = kzalloc(ops->size, GFP_KERNEL); struct tcf_idrinfo *idrinfo = tn->idrinfo; int err = -ENOMEM; if (unlikely(!p)) return -ENOMEM; refcount_set(&p->tcfa_refcnt, 1); if (bind) atomic_set(&p->tcfa_bindcnt, 1); if (cpustats) { p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); if (!p->cpu_bstats) goto err1; p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); if (!p->cpu_bstats_hw) goto err2; p->cpu_qstats = alloc_percpu(struct gnet_stats_queue); if (!p->cpu_qstats) goto err3; } gnet_stats_basic_sync_init(&p->tcfa_bstats); gnet_stats_basic_sync_init(&p->tcfa_bstats_hw); spin_lock_init(&p->tcfa_lock); p->tcfa_index = index; p->tcfa_tm.install = jiffies; p->tcfa_tm.lastuse = jiffies; p->tcfa_tm.firstuse = 0; p->tcfa_flags = flags; if (est) { err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats, &p->tcfa_rate_est, &p->tcfa_lock, false, est); if (err) goto err4; } p->idrinfo = idrinfo; __module_get(ops->owner); p->ops = ops; *a = p; return 0; err4: free_percpu(p->cpu_qstats); err3: free_percpu(p->cpu_bstats_hw); err2: free_percpu(p->cpu_bstats); err1: kfree(p); return err; } EXPORT_SYMBOL(tcf_idr_create); int tcf_idr_create_from_flags(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, u32 flags) { /* Set cpustats according to actions flags. */ return tcf_idr_create(tn, index, est, a, ops, bind, !(flags & TCA_ACT_FLAGS_NO_PERCPU_STATS), flags); } EXPORT_SYMBOL(tcf_idr_create_from_flags); /* Cleanup idr index that was allocated but not initialized. */ void tcf_idr_cleanup(struct tc_action_net *tn, u32 index) { struct tcf_idrinfo *idrinfo = tn->idrinfo; mutex_lock(&idrinfo->lock); /* Remove ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ WARN_ON(!IS_ERR(idr_remove(&idrinfo->action_idr, index))); mutex_unlock(&idrinfo->lock); } EXPORT_SYMBOL(tcf_idr_cleanup); /* Check if action with specified index exists. If actions is found, increments * its reference and bind counters, and return 1. Otherwise insert temporary * error pointer (to prevent concurrent users from inserting actions with same * index) and return 0. * * May return -EAGAIN for binding actions in case of a parallel add/delete on * the requested index. */ int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, struct tc_action **a, int bind) { struct tcf_idrinfo *idrinfo = tn->idrinfo; struct tc_action *p; int ret; u32 max; if (*index) { rcu_read_lock(); p = idr_find(&idrinfo->action_idr, *index); if (IS_ERR(p)) { /* This means that another process allocated * index but did not assign the pointer yet. */ rcu_read_unlock(); return -EAGAIN; } if (!p) { /* Empty slot, try to allocate it */ max = *index; rcu_read_unlock(); goto new; } if (!refcount_inc_not_zero(&p->tcfa_refcnt)) { /* Action was deleted in parallel */ rcu_read_unlock(); return -EAGAIN; } if (bind) atomic_inc(&p->tcfa_bindcnt); *a = p; rcu_read_unlock(); return 1; } else { /* Find a slot */ *index = 1; max = UINT_MAX; } new: *a = NULL; mutex_lock(&idrinfo->lock); ret = idr_alloc_u32(&idrinfo->action_idr, ERR_PTR(-EBUSY), index, max, GFP_KERNEL); mutex_unlock(&idrinfo->lock); /* N binds raced for action allocation, * retry for all the ones that failed. */ if (ret == -ENOSPC && *index == max) ret = -EAGAIN; return ret; } EXPORT_SYMBOL(tcf_idr_check_alloc); void tcf_idrinfo_destroy(const struct tc_action_ops *ops, struct tcf_idrinfo *idrinfo) { struct idr *idr = &idrinfo->action_idr; bool mutex_taken = false; struct tc_action *p; unsigned long id = 1; unsigned long tmp; int ret; idr_for_each_entry_ul(idr, p, tmp, id) { if (tc_act_in_hw(p) && !mutex_taken) { rtnl_lock(); mutex_taken = true; } ret = __tcf_idr_release(p, false, true); if (ret == ACT_P_DELETED) module_put(ops->owner); else if (ret < 0) return; } if (mutex_taken) rtnl_unlock(); idr_destroy(&idrinfo->action_idr); } EXPORT_SYMBOL(tcf_idrinfo_destroy); static LIST_HEAD(act_base); static DEFINE_RWLOCK(act_mod_lock); /* since act ops id is stored in pernet subsystem list, * then there is no way to walk through only all the action * subsystem, so we keep tc action pernet ops id for * reoffload to walk through. */ static LIST_HEAD(act_pernet_id_list); static DEFINE_MUTEX(act_id_mutex); struct tc_act_pernet_id { struct list_head list; unsigned int id; }; static int tcf_pernet_add_id_list(unsigned int id) { struct tc_act_pernet_id *id_ptr; int ret = 0; mutex_lock(&act_id_mutex); list_for_each_entry(id_ptr, &act_pernet_id_list, list) { if (id_ptr->id == id) { ret = -EEXIST; goto err_out; } } id_ptr = kzalloc(sizeof(*id_ptr), GFP_KERNEL); if (!id_ptr) { ret = -ENOMEM; goto err_out; } id_ptr->id = id; list_add_tail(&id_ptr->list, &act_pernet_id_list); err_out: mutex_unlock(&act_id_mutex); return ret; } static void tcf_pernet_del_id_list(unsigned int id) { struct tc_act_pernet_id *id_ptr; mutex_lock(&act_id_mutex); list_for_each_entry(id_ptr, &act_pernet_id_list, list) { if (id_ptr->id == id) { list_del(&id_ptr->list); kfree(id_ptr); break; } } mutex_unlock(&act_id_mutex); } int tcf_register_action(struct tc_action_ops *act, struct pernet_operations *ops) { struct tc_action_ops *a; int ret; if (!act->act || !act->dump || !act->init) return -EINVAL; /* We have to register pernet ops before making the action ops visible, * otherwise tcf_action_init_1() could get a partially initialized * netns. */ ret = register_pernet_subsys(ops); if (ret) return ret; if (ops->id) { ret = tcf_pernet_add_id_list(*ops->id); if (ret) goto err_id; } write_lock(&act_mod_lock); list_for_each_entry(a, &act_base, head) { if (act->id == a->id || (strcmp(act->kind, a->kind) == 0)) { ret = -EEXIST; goto err_out; } } list_add_tail(&act->head, &act_base); write_unlock(&act_mod_lock); return 0; err_out: write_unlock(&act_mod_lock); if (ops->id) tcf_pernet_del_id_list(*ops->id); err_id: unregister_pernet_subsys(ops); return ret; } EXPORT_SYMBOL(tcf_register_action); int tcf_unregister_action(struct tc_action_ops *act, struct pernet_operations *ops) { struct tc_action_ops *a; int err = -ENOENT; write_lock(&act_mod_lock); list_for_each_entry(a, &act_base, head) { if (a == act) { list_del(&act->head); err = 0; break; } } write_unlock(&act_mod_lock); if (!err) { unregister_pernet_subsys(ops); if (ops->id) tcf_pernet_del_id_list(*ops->id); } return err; } EXPORT_SYMBOL(tcf_unregister_action); /* lookup by name */ static struct tc_action_ops *tc_lookup_action_n(char *kind) { struct tc_action_ops *a, *res = NULL; if (kind) { read_lock(&act_mod_lock); list_for_each_entry(a, &act_base, head) { if (strcmp(kind, a->kind) == 0) { if (try_module_get(a->owner)) res = a; break; } } read_unlock(&act_mod_lock); } return res; } /* lookup by nlattr */ static struct tc_action_ops *tc_lookup_action(struct nlattr *kind) { struct tc_action_ops *a, *res = NULL; if (kind) { read_lock(&act_mod_lock); list_for_each_entry(a, &act_base, head) { if (nla_strcmp(kind, a->kind) == 0) { if (try_module_get(a->owner)) res = a; break; } } read_unlock(&act_mod_lock); } return res; } /*TCA_ACT_MAX_PRIO is 32, there count up to 32 */ #define TCA_ACT_MAX_PRIO_MASK 0x1FF int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, int nr_actions, struct tcf_result *res) { u32 jmp_prgcnt = 0; u32 jmp_ttl = TCA_ACT_MAX_PRIO; /*matches actions per filter */ int i; int ret = TC_ACT_OK; if (skb_skip_tc_classify(skb)) return TC_ACT_OK; restart_act_graph: for (i = 0; i < nr_actions; i++) { const struct tc_action *a = actions[i]; int repeat_ttl; if (jmp_prgcnt > 0) { jmp_prgcnt -= 1; continue; } if (tc_act_skip_sw(a->tcfa_flags)) continue; repeat_ttl = 32; repeat: ret = tc_act(skb, a, res); if (unlikely(ret == TC_ACT_REPEAT)) { if (--repeat_ttl != 0) goto repeat; /* suspicious opcode, stop pipeline */ net_warn_ratelimited("TC_ACT_REPEAT abuse ?\n"); return TC_ACT_OK; } if (TC_ACT_EXT_CMP(ret, TC_ACT_JUMP)) { jmp_prgcnt = ret & TCA_ACT_MAX_PRIO_MASK; if (!jmp_prgcnt || (jmp_prgcnt > nr_actions)) { /* faulty opcode, stop pipeline */ return TC_ACT_OK; } else { jmp_ttl -= 1; if (jmp_ttl > 0) goto restart_act_graph; else /* faulty graph, stop pipeline */ return TC_ACT_OK; } } else if (TC_ACT_EXT_CMP(ret, TC_ACT_GOTO_CHAIN)) { if (unlikely(!rcu_access_pointer(a->goto_chain))) { tcf_set_drop_reason(skb, SKB_DROP_REASON_TC_CHAIN_NOTFOUND); return TC_ACT_SHOT; } tcf_action_goto_chain_exec(a, res); } if (ret != TC_ACT_PIPE) break; } return ret; } EXPORT_SYMBOL(tcf_action_exec); int tcf_action_destroy(struct tc_action *actions[], int bind) { const struct tc_action_ops *ops; struct tc_action *a; int ret = 0, i; tcf_act_for_each_action(i, a, actions) { actions[i] = NULL; ops = a->ops; ret = __tcf_idr_release(a, bind, true); if (ret == ACT_P_DELETED) module_put(ops->owner); else if (ret < 0) return ret; } return ret; } static int tcf_action_put(struct tc_action *p) { return __tcf_action_put(p, false); } static void tcf_action_put_many(struct tc_action *actions[]) { struct tc_action *a; int i; tcf_act_for_each_action(i, a, actions) { const struct tc_action_ops *ops = a->ops; if (tcf_action_put(a)) module_put(ops->owner); } } static void tca_put_bound_many(struct tc_action *actions[], int init_res[]) { struct tc_action *a; int i; tcf_act_for_each_action(i, a, actions) { const struct tc_action_ops *ops = a->ops; if (init_res[i] == ACT_P_CREATED) continue; if (tcf_action_put(a)) module_put(ops->owner); } } int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { return a->ops->dump(skb, a, bind, ref); } int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, int ref, bool terse) { struct tc_action *a; int err = -EINVAL, i; struct nlattr *nest; tcf_act_for_each_action(i, a, actions) { nest = nla_nest_start_noflag(skb, i + 1); if (nest == NULL) goto nla_put_failure; err = terse ? tcf_action_dump_terse(skb, a, false) : tcf_action_dump_1(skb, a, bind, ref); if (err < 0) goto errout; nla_nest_end(skb, nest); } return 0; nla_put_failure: err = -EINVAL; errout: nla_nest_cancel(skb, nest); return err; } static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) { struct tc_cookie *c = kzalloc(sizeof(*c), GFP_KERNEL); if (!c) return NULL; c->data = nla_memdup(tb[TCA_ACT_COOKIE], GFP_KERNEL); if (!c->data) { kfree(c); return NULL; } c->len = nla_len(tb[TCA_ACT_COOKIE]); return c; } static u8 tcf_action_hw_stats_get(struct nlattr *hw_stats_attr) { struct nla_bitfield32 hw_stats_bf; /* If the user did not pass the attr, that means he does * not care about the type. Return "any" in that case * which is setting on all supported types. */ if (!hw_stats_attr) return TCA_ACT_HW_STATS_ANY; hw_stats_bf = nla_get_bitfield32(hw_stats_attr); return hw_stats_bf.value; } static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { [TCA_ACT_KIND] = { .type = NLA_STRING }, [TCA_ACT_INDEX] = { .type = NLA_U32 }, [TCA_ACT_COOKIE] = { .type = NLA_BINARY, .len = TC_COOKIE_MAX_SIZE }, [TCA_ACT_OPTIONS] = { .type = NLA_NESTED }, [TCA_ACT_FLAGS] = NLA_POLICY_BITFIELD32(TCA_ACT_FLAGS_NO_PERCPU_STATS | TCA_ACT_FLAGS_SKIP_HW | TCA_ACT_FLAGS_SKIP_SW), [TCA_ACT_HW_STATS] = NLA_POLICY_BITFIELD32(TCA_ACT_HW_STATS_ANY), }; void tcf_idr_insert_many(struct tc_action *actions[], int init_res[]) { struct tc_action *a; int i; tcf_act_for_each_action(i, a, actions) { struct tcf_idrinfo *idrinfo; if (init_res[i] == ACT_P_BOUND) continue; idrinfo = a->idrinfo; mutex_lock(&idrinfo->lock); /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ idr_replace(&idrinfo->action_idr, a, a->tcfa_index); mutex_unlock(&idrinfo->lock); } } struct tc_action_ops *tc_action_load_ops(struct nlattr *nla, u32 flags, struct netlink_ext_ack *extack) { bool police = flags & TCA_ACT_FLAGS_POLICE; struct nlattr *tb[TCA_ACT_MAX + 1]; struct tc_action_ops *a_o; char act_name[IFNAMSIZ]; struct nlattr *kind; int err; if (!police) { err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, tcf_action_policy, extack); if (err < 0) return ERR_PTR(err); err = -EINVAL; kind = tb[TCA_ACT_KIND]; if (!kind) { NL_SET_ERR_MSG(extack, "TC action kind must be specified"); return ERR_PTR(err); } if (nla_strscpy(act_name, kind, IFNAMSIZ) < 0) { NL_SET_ERR_MSG(extack, "TC action name too long"); return ERR_PTR(err); } } else { if (strscpy(act_name, "police", IFNAMSIZ) < 0) { NL_SET_ERR_MSG(extack, "TC action name too long"); return ERR_PTR(-EINVAL); } } a_o = tc_lookup_action_n(act_name); if (a_o == NULL) { #ifdef CONFIG_MODULES bool rtnl_held = !(flags & TCA_ACT_FLAGS_NO_RTNL); if (rtnl_held) rtnl_unlock(); request_module(NET_ACT_ALIAS_PREFIX "%s", act_name); if (rtnl_held) rtnl_lock(); a_o = tc_lookup_action_n(act_name); /* We dropped the RTNL semaphore in order to * perform the module load. So, even if we * succeeded in loading the module we have to * tell the caller to replay the request. We * indicate this using -EAGAIN. */ if (a_o != NULL) { module_put(a_o->owner); return ERR_PTR(-EAGAIN); } #endif NL_SET_ERR_MSG(extack, "Failed to load TC action module"); return ERR_PTR(-ENOENT); } return a_o; } struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, struct tc_action_ops *a_o, int *init_res, u32 flags, struct netlink_ext_ack *extack) { bool police = flags & TCA_ACT_FLAGS_POLICE; struct nla_bitfield32 userflags = { 0, 0 }; struct tc_cookie *user_cookie = NULL; u8 hw_stats = TCA_ACT_HW_STATS_ANY; struct nlattr *tb[TCA_ACT_MAX + 1]; struct tc_action *a; int err; /* backward compatibility for policer */ if (!police) { err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, tcf_action_policy, extack); if (err < 0) return ERR_PTR(err); if (tb[TCA_ACT_COOKIE]) { user_cookie = nla_memdup_cookie(tb); if (!user_cookie) { NL_SET_ERR_MSG(extack, "No memory to generate TC cookie"); err = -ENOMEM; goto err_out; } } hw_stats = tcf_action_hw_stats_get(tb[TCA_ACT_HW_STATS]); if (tb[TCA_ACT_FLAGS]) { userflags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]); if (!tc_act_flags_valid(userflags.value)) { err = -EINVAL; goto err_out; } } err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, tp, userflags.value | flags, extack); } else { err = a_o->init(net, nla, est, &a, tp, userflags.value | flags, extack); } if (err < 0) goto err_out; *init_res = err; if (!police && tb[TCA_ACT_COOKIE]) tcf_set_action_cookie(&a->user_cookie, user_cookie); if (!police) a->hw_stats = hw_stats; return a; err_out: if (user_cookie) { kfree(user_cookie->data); kfree(user_cookie); } return ERR_PTR(err); } static bool tc_act_bind(u32 flags) { return !!(flags & TCA_ACT_FLAGS_BIND); } /* Returns numbers of initialized actions or negative error. */ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, struct tc_action *actions[], int init_res[], size_t *attr_size, u32 flags, u32 fl_flags, struct netlink_ext_ack *extack) { struct tc_action_ops *ops[TCA_ACT_MAX_PRIO] = {}; struct nlattr *tb[TCA_ACT_MAX_PRIO + 2]; struct tc_action *act; size_t sz = 0; int err; int i; err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO + 1, nla, NULL, extack); if (err < 0) return err; /* The nested attributes are parsed as types, but they are really an * array of actions. So we parse one more than we can handle, and return * an error if the last one is set (as that indicates that the request * contained more than the maximum number of actions). */ if (tb[TCA_ACT_MAX_PRIO + 1]) { NL_SET_ERR_MSG_FMT(extack, "Only %d actions supported per filter", TCA_ACT_MAX_PRIO); return -EINVAL; } for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { struct tc_action_ops *a_o; a_o = tc_action_load_ops(tb[i], flags, extack); if (IS_ERR(a_o)) { err = PTR_ERR(a_o); goto err_mod; } ops[i - 1] = a_o; } for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { act = tcf_action_init_1(net, tp, tb[i], est, ops[i - 1], &init_res[i - 1], flags, extack); if (IS_ERR(act)) { err = PTR_ERR(act); goto err; } sz += tcf_action_fill_size(act); /* Start from index 0 */ actions[i - 1] = act; if (tc_act_bind(flags)) { bool skip_sw = tc_skip_sw(fl_flags); bool skip_hw = tc_skip_hw(fl_flags); if (tc_act_bind(act->tcfa_flags)) { /* Action is created by classifier and is not * standalone. Check that the user did not set * any action flags different than the * classifier flags, and inherit the flags from * the classifier for the compatibility case * where no flags were specified at all. */ if ((tc_act_skip_sw(act->tcfa_flags) && !skip_sw) || (tc_act_skip_hw(act->tcfa_flags) && !skip_hw)) { NL_SET_ERR_MSG(extack, "Mismatch between action and filter offload flags"); err = -EINVAL; goto err; } if (skip_sw) act->tcfa_flags |= TCA_ACT_FLAGS_SKIP_SW; if (skip_hw) act->tcfa_flags |= TCA_ACT_FLAGS_SKIP_HW; continue; } /* Action is standalone */ if (skip_sw != tc_act_skip_sw(act->tcfa_flags) || skip_hw != tc_act_skip_hw(act->tcfa_flags)) { NL_SET_ERR_MSG(extack, "Mismatch between action and filter offload flags"); err = -EINVAL; goto err; } } else { err = tcf_action_offload_add(act, extack); if (tc_act_skip_sw(act->tcfa_flags) && err) goto err; } } /* We have to commit them all together, because if any error happened in * between, we could not handle the failure gracefully. */ tcf_idr_insert_many(actions, init_res); *attr_size = tcf_action_full_attrs_size(sz); err = i - 1; goto err_mod; err: tcf_action_destroy(actions, flags & TCA_ACT_FLAGS_BIND); err_mod: for (i = 0; i < TCA_ACT_MAX_PRIO && ops[i]; i++) module_put(ops[i]->owner); return err; } void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets, u64 drops, bool hw) { if (a->cpu_bstats) { _bstats_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); this_cpu_ptr(a->cpu_qstats)->drops += drops; if (hw) _bstats_update(this_cpu_ptr(a->cpu_bstats_hw), bytes, packets); return; } _bstats_update(&a->tcfa_bstats, bytes, packets); atomic_add(drops, &a->tcfa_drops); if (hw) _bstats_update(&a->tcfa_bstats_hw, bytes, packets); } EXPORT_SYMBOL(tcf_action_update_stats); int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, int compat_mode) { struct gnet_stats_queue qstats = {0}; struct gnet_dump d; int err = 0; if (p == NULL) goto errout; /* compat_mode being true specifies a call that is supposed * to add additional backward compatibility statistic TLVs. */ if (compat_mode) { if (p->type == TCA_OLD_COMPAT) err = gnet_stats_start_copy_compat(skb, 0, TCA_STATS, TCA_XSTATS, &p->tcfa_lock, &d, TCA_PAD); else return 0; } else err = gnet_stats_start_copy(skb, TCA_ACT_STATS, &p->tcfa_lock, &d, TCA_ACT_PAD); if (err < 0) goto errout; qstats.drops = atomic_read(&p->tcfa_drops); qstats.overlimits = atomic_read(&p->tcfa_overlimits); if (gnet_stats_copy_basic(&d, p->cpu_bstats, &p->tcfa_bstats, false) < 0 || gnet_stats_copy_basic_hw(&d, p->cpu_bstats_hw, &p->tcfa_bstats_hw, false) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 || gnet_stats_copy_queue(&d, p->cpu_qstats, &qstats, qstats.qlen) < 0) goto errout; if (gnet_stats_finish_copy(&d) < 0) goto errout; return 0; errout: return -1; } static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[], u32 portid, u32 seq, u16 flags, int event, int bind, int ref, struct netlink_ext_ack *extack) { struct tcamsg *t; struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*t), flags); if (!nlh) goto out_nlmsg_trim; t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; if (extack && extack->_msg && nla_put_string(skb, TCA_ROOT_EXT_WARN_MSG, extack->_msg)) goto out_nlmsg_trim; nest = nla_nest_start_noflag(skb, TCA_ACT_TAB); if (!nest) goto out_nlmsg_trim; if (tcf_action_dump(skb, actions, bind, ref, false) < 0) goto out_nlmsg_trim; nla_nest_end(skb, nest); nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; out_nlmsg_trim: nlmsg_trim(skb, b); return -1; } static int tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, struct tc_action *actions[], int event, struct netlink_ext_ack *extack) { struct sk_buff *skb; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, 0, 1, NULL) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action"); kfree_skb(skb); return -EINVAL; } return rtnl_unicast(skb, net, portid); } static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla, struct nlmsghdr *n, u32 portid, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_ACT_MAX + 1]; const struct tc_action_ops *ops; struct tc_action *a; int index; int err; err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, tcf_action_policy, extack); if (err < 0) goto err_out; err = -EINVAL; if (tb[TCA_ACT_INDEX] == NULL || nla_len(tb[TCA_ACT_INDEX]) < sizeof(index)) { NL_SET_ERR_MSG(extack, "Invalid TC action index value"); goto err_out; } index = nla_get_u32(tb[TCA_ACT_INDEX]); err = -EINVAL; ops = tc_lookup_action(tb[TCA_ACT_KIND]); if (!ops) { /* could happen in batch of actions */ NL_SET_ERR_MSG(extack, "Specified TC action kind not found"); goto err_out; } err = -ENOENT; if (__tcf_idr_search(net, ops, &a, index) == 0) { NL_SET_ERR_MSG(extack, "TC action with specified index not found"); goto err_mod; } module_put(ops->owner); return a; err_mod: module_put(ops->owner); err_out: return ERR_PTR(err); } static int tca_action_flush(struct net *net, struct nlattr *nla, struct nlmsghdr *n, u32 portid, struct netlink_ext_ack *extack) { struct sk_buff *skb; unsigned char *b; struct nlmsghdr *nlh; struct tcamsg *t; struct netlink_callback dcb; struct nlattr *nest; struct nlattr *tb[TCA_ACT_MAX + 1]; const struct tc_action_ops *ops; struct nlattr *kind; int err = -ENOMEM; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return err; b = skb_tail_pointer(skb); err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, tcf_action_policy, extack); if (err < 0) goto err_out; err = -EINVAL; kind = tb[TCA_ACT_KIND]; ops = tc_lookup_action(kind); if (!ops) { /*some idjot trying to flush unknown action */ NL_SET_ERR_MSG(extack, "Cannot flush unknown TC action"); goto err_out; } nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t), 0); if (!nlh) { NL_SET_ERR_MSG(extack, "Failed to create TC action flush notification"); goto out_module_put; } t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; nest = nla_nest_start_noflag(skb, TCA_ACT_TAB); if (!nest) { NL_SET_ERR_MSG(extack, "Failed to add new netlink message"); goto out_module_put; } err = __tcf_generic_walker(net, skb, &dcb, RTM_DELACTION, ops, extack); if (err <= 0) { nla_nest_cancel(skb, nest); goto out_module_put; } nla_nest_end(skb, nest); nlh->nlmsg_len = skb_tail_pointer(skb) - b; nlh->nlmsg_flags |= NLM_F_ROOT; module_put(ops->owner); err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); if (err < 0) NL_SET_ERR_MSG(extack, "Failed to send TC action flush notification"); return err; out_module_put: module_put(ops->owner); err_out: kfree_skb(skb); return err; } static int tcf_action_delete(struct net *net, struct tc_action *actions[]) { struct tc_action *a; int i; tcf_act_for_each_action(i, a, actions) { const struct tc_action_ops *ops = a->ops; /* Actions can be deleted concurrently so we must save their * type and id to search again after reference is released. */ struct tcf_idrinfo *idrinfo = a->idrinfo; u32 act_index = a->tcfa_index; actions[i] = NULL; if (tcf_action_put(a)) { /* last reference, action was deleted concurrently */ module_put(ops->owner); } else { int ret; /* now do the delete */ ret = tcf_idr_delete_index(idrinfo, act_index); if (ret < 0) return ret; } } return 0; } static struct sk_buff *tcf_reoffload_del_notify_msg(struct net *net, struct tc_action *action) { size_t attr_size = tcf_action_fill_size(action); struct tc_action *actions[TCA_ACT_MAX_PRIO] = { [0] = action, }; struct sk_buff *skb; skb = alloc_skb(max(attr_size, NLMSG_GOODSIZE), GFP_KERNEL); if (!skb) return ERR_PTR(-ENOBUFS); if (tca_get_fill(skb, actions, 0, 0, 0, RTM_DELACTION, 0, 1, NULL) <= 0) { kfree_skb(skb); return ERR_PTR(-EINVAL); } return skb; } static int tcf_reoffload_del_notify(struct net *net, struct tc_action *action) { const struct tc_action_ops *ops = action->ops; struct sk_buff *skb; int ret; if (!rtnl_notify_needed(net, 0, RTNLGRP_TC)) { skb = NULL; } else { skb = tcf_reoffload_del_notify_msg(net, action); if (IS_ERR(skb)) return PTR_ERR(skb); } ret = tcf_idr_release_unsafe(action); if (ret == ACT_P_DELETED) { module_put(ops->owner); ret = rtnetlink_maybe_send(skb, net, 0, RTNLGRP_TC, 0); } else { kfree_skb(skb); } return ret; } int tcf_action_reoffload_cb(flow_indr_block_bind_cb_t *cb, void *cb_priv, bool add) { struct tc_act_pernet_id *id_ptr; struct tcf_idrinfo *idrinfo; struct tc_action_net *tn; struct tc_action *p; unsigned int act_id; unsigned long tmp; unsigned long id; struct idr *idr; struct net *net; int ret; if (!cb) return -EINVAL; down_read(&net_rwsem); mutex_lock(&act_id_mutex); for_each_net(net) { list_for_each_entry(id_ptr, &act_pernet_id_list, list) { act_id = id_ptr->id; tn = net_generic(net, act_id); if (!tn) continue; idrinfo = tn->idrinfo; if (!idrinfo) continue; mutex_lock(&idrinfo->lock); idr = &idrinfo->action_idr; idr_for_each_entry_ul(idr, p, tmp, id) { if (IS_ERR(p) || tc_act_bind(p->tcfa_flags)) continue; if (add) { tcf_action_offload_add_ex(p, NULL, cb, cb_priv); continue; } /* cb unregister to update hw count */ ret = tcf_action_offload_del_ex(p, cb, cb_priv); if (ret < 0) continue; if (tc_act_skip_sw(p->tcfa_flags) && !tc_act_in_hw(p)) tcf_reoffload_del_notify(net, p); } mutex_unlock(&idrinfo->lock); } } mutex_unlock(&act_id_mutex); up_read(&net_rwsem); return 0; } static struct sk_buff *tcf_del_notify_msg(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], u32 portid, size_t attr_size, struct netlink_ext_ack *extack) { struct sk_buff *skb; skb = alloc_skb(max(attr_size, NLMSG_GOODSIZE), GFP_KERNEL); if (!skb) return ERR_PTR(-ENOBUFS); if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION, 0, 2, extack) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action attributes"); kfree_skb(skb); return ERR_PTR(-EINVAL); } return skb; } static int tcf_del_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], u32 portid, size_t attr_size, struct netlink_ext_ack *extack) { struct sk_buff *skb; int ret; if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) { skb = NULL; } else { skb = tcf_del_notify_msg(net, n, actions, portid, attr_size, extack); if (IS_ERR(skb)) return PTR_ERR(skb); } /* now do the delete */ ret = tcf_action_delete(net, actions); if (ret < 0) { NL_SET_ERR_MSG(extack, "Failed to delete TC action"); kfree_skb(skb); return ret; } return rtnetlink_maybe_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); } static int tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, u32 portid, int event, struct netlink_ext_ack *extack) { int i, ret; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; size_t attr_size = 0; struct tc_action *actions[TCA_ACT_MAX_PRIO] = {}; ret = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack); if (ret < 0) return ret; if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) { if (tb[1]) return tca_action_flush(net, tb[1], n, portid, extack); NL_SET_ERR_MSG(extack, "Invalid netlink attributes while flushing TC action"); return -EINVAL; } for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { act = tcf_action_get_1(net, tb[i], n, portid, extack); if (IS_ERR(act)) { ret = PTR_ERR(act); goto err; } attr_size += tcf_action_fill_size(act); actions[i - 1] = act; } attr_size = tcf_action_full_attrs_size(attr_size); if (event == RTM_GETACTION) ret = tcf_get_notify(net, portid, n, actions, event, extack); else { /* delete */ ret = tcf_del_notify(net, n, actions, portid, attr_size, extack); if (ret) goto err; return 0; } err: tcf_action_put_many(actions); return ret; } static struct sk_buff *tcf_add_notify_msg(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], u32 portid, size_t attr_size, struct netlink_ext_ack *extack) { struct sk_buff *skb; skb = alloc_skb(max(attr_size, NLMSG_GOODSIZE), GFP_KERNEL); if (!skb) return ERR_PTR(-ENOBUFS); if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags, RTM_NEWACTION, 0, 0, extack) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action"); kfree_skb(skb); return ERR_PTR(-EINVAL); } return skb; } static int tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], u32 portid, size_t attr_size, struct netlink_ext_ack *extack) { struct sk_buff *skb; if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) { skb = NULL; } else { skb = tcf_add_notify_msg(net, n, actions, portid, attr_size, extack); if (IS_ERR(skb)) return PTR_ERR(skb); } return rtnetlink_maybe_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); } static int tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n, u32 portid, u32 flags, struct netlink_ext_ack *extack) { size_t attr_size = 0; int loop, ret; struct tc_action *actions[TCA_ACT_MAX_PRIO] = {}; int init_res[TCA_ACT_MAX_PRIO] = {}; for (loop = 0; loop < 10; loop++) { ret = tcf_action_init(net, NULL, nla, NULL, actions, init_res, &attr_size, flags, 0, extack); if (ret != -EAGAIN) break; } if (ret < 0) return ret; ret = tcf_add_notify(net, n, actions, portid, attr_size, extack); /* only put bound actions */ tca_put_bound_many(actions, init_res); return ret; } static const struct nla_policy tcaa_policy[TCA_ROOT_MAX + 1] = { [TCA_ROOT_FLAGS] = NLA_POLICY_BITFIELD32(TCA_ACT_FLAG_LARGE_DUMP_ON | TCA_ACT_FLAG_TERSE_DUMP), [TCA_ROOT_TIME_DELTA] = { .type = NLA_U32 }, }; static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_ROOT_MAX + 1]; u32 portid = NETLINK_CB(skb).portid; u32 flags = 0; int ret = 0; if ((n->nlmsg_type != RTM_GETACTION) && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; ret = nlmsg_parse_deprecated(n, sizeof(struct tcamsg), tca, TCA_ROOT_MAX, NULL, extack); if (ret < 0) return ret; if (tca[TCA_ACT_TAB] == NULL) { NL_SET_ERR_MSG(extack, "Netlink action attributes missing"); return -EINVAL; } /* n->nlmsg_flags & NLM_F_CREATE */ switch (n->nlmsg_type) { case RTM_NEWACTION: /* we are going to assume all other flags * imply create only if it doesn't exist * Note that CREATE | EXCL implies that * but since we want avoid ambiguity (eg when flags * is zero) then just set this */ if (n->nlmsg_flags & NLM_F_REPLACE) flags = TCA_ACT_FLAGS_REPLACE; ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, flags, extack); break; case RTM_DELACTION: ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, portid, RTM_DELACTION, extack); break; case RTM_GETACTION: ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, portid, RTM_GETACTION, extack); break; default: BUG(); } return ret; } static struct nlattr *find_dump_kind(struct nlattr **nla) { struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1]; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct nlattr *kind; tb1 = nla[TCA_ACT_TAB]; if (tb1 == NULL) return NULL; if (nla_parse_deprecated(tb, TCA_ACT_MAX_PRIO, nla_data(tb1), NLMSG_ALIGN(nla_len(tb1)), NULL, NULL) < 0) return NULL; if (tb[1] == NULL) return NULL; if (nla_parse_nested_deprecated(tb2, TCA_ACT_MAX, tb[1], tcf_action_policy, NULL) < 0) return NULL; kind = tb2[TCA_ACT_KIND]; return kind; } static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; struct tc_action_ops *a_o; int ret = 0; struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh); struct nlattr *tb[TCA_ROOT_MAX + 1]; struct nlattr *count_attr = NULL; unsigned long jiffy_since = 0; struct nlattr *kind = NULL; struct nla_bitfield32 bf; u32 msecs_since = 0; u32 act_count = 0; ret = nlmsg_parse_deprecated(cb->nlh, sizeof(struct tcamsg), tb, TCA_ROOT_MAX, tcaa_policy, cb->extack); if (ret < 0) return ret; kind = find_dump_kind(tb); if (kind == NULL) { pr_info("tc_dump_action: action bad kind\n"); return 0; } a_o = tc_lookup_action(kind); if (a_o == NULL) return 0; cb->args[2] = 0; if (tb[TCA_ROOT_FLAGS]) { bf = nla_get_bitfield32(tb[TCA_ROOT_FLAGS]); cb->args[2] = bf.value; } if (tb[TCA_ROOT_TIME_DELTA]) { msecs_since = nla_get_u32(tb[TCA_ROOT_TIME_DELTA]); } nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, sizeof(*t), 0); if (!nlh) goto out_module_put; if (msecs_since) jiffy_since = jiffies - msecs_to_jiffies(msecs_since); t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; cb->args[3] = jiffy_since; count_attr = nla_reserve(skb, TCA_ROOT_COUNT, sizeof(u32)); if (!count_attr) goto out_module_put; nest = nla_nest_start_noflag(skb, TCA_ACT_TAB); if (nest == NULL) goto out_module_put; ret = __tcf_generic_walker(net, skb, cb, RTM_GETACTION, a_o, NULL); if (ret < 0) goto out_module_put; if (ret > 0) { nla_nest_end(skb, nest); ret = skb->len; act_count = cb->args[1]; memcpy(nla_data(count_attr), &act_count, sizeof(u32)); cb->args[1] = 0; } else nlmsg_trim(skb, b); nlh->nlmsg_len = skb_tail_pointer(skb) - b; if (NETLINK_CB(cb->skb).portid && ret) nlh->nlmsg_flags |= NLM_F_MULTI; module_put(a_o->owner); return skb->len; out_module_put: module_put(a_o->owner); nlmsg_trim(skb, b); return skb->len; } static const struct rtnl_msg_handler tc_action_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWACTION, .doit = tc_ctl_action}, {.msgtype = RTM_DELACTION, .doit = tc_ctl_action}, {.msgtype = RTM_GETACTION, .doit = tc_ctl_action, .dumpit = tc_dump_action}, }; static int __init tc_action_init(void) { rtnl_register_many(tc_action_rtnl_msg_handlers); return 0; } subsys_initcall(tc_action_init);
25 20 6 20 1 3 3 2 1 1 10 2 14 8 18 4 19 3 1 1 1 6 8 1 6 1 6 6 1 1 3 3 3 2 7 3 1 6 11 11 24 24 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2020, Red Hat, Inc. */ #define pr_fmt(fmt) "MPTCP: " fmt #include "protocol.h" #include "mptcp_pm_gen.h" #define MPTCP_PM_CMD_GRP_OFFSET 0 #define MPTCP_PM_EV_GRP_OFFSET 1 static const struct genl_multicast_group mptcp_pm_mcgrps[] = { [MPTCP_PM_CMD_GRP_OFFSET] = { .name = MPTCP_PM_CMD_GRP_NAME, }, [MPTCP_PM_EV_GRP_OFFSET] = { .name = MPTCP_PM_EV_GRP_NAME, .flags = GENL_MCAST_CAP_NET_ADMIN, }, }; static int mptcp_pm_family_to_addr(int family) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (family == AF_INET6) return MPTCP_PM_ADDR_ATTR_ADDR6; #endif return MPTCP_PM_ADDR_ATTR_ADDR4; } static int mptcp_pm_parse_pm_addr_attr(struct nlattr *tb[], const struct nlattr *attr, struct genl_info *info, struct mptcp_addr_info *addr, bool require_family) { int err, addr_addr; if (!attr) { GENL_SET_ERR_MSG(info, "missing address info"); return -EINVAL; } /* no validation needed - was already done via nested policy */ err = nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr, mptcp_pm_address_nl_policy, info->extack); if (err) return err; if (tb[MPTCP_PM_ADDR_ATTR_ID]) addr->id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]); if (!tb[MPTCP_PM_ADDR_ATTR_FAMILY]) { if (!require_family) return 0; NL_SET_ERR_MSG_ATTR(info->extack, attr, "missing family"); return -EINVAL; } addr->family = nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_FAMILY]); if (addr->family != AF_INET #if IS_ENABLED(CONFIG_MPTCP_IPV6) && addr->family != AF_INET6 #endif ) { NL_SET_ERR_MSG_ATTR(info->extack, attr, "unknown address family"); return -EINVAL; } addr_addr = mptcp_pm_family_to_addr(addr->family); if (!tb[addr_addr]) { NL_SET_ERR_MSG_ATTR(info->extack, attr, "missing address data"); return -EINVAL; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (addr->family == AF_INET6) addr->addr6 = nla_get_in6_addr(tb[addr_addr]); else #endif addr->addr.s_addr = nla_get_in_addr(tb[addr_addr]); if (tb[MPTCP_PM_ADDR_ATTR_PORT]) addr->port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT])); return 0; } int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, struct mptcp_addr_info *addr) { struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; memset(addr, 0, sizeof(*addr)); return mptcp_pm_parse_pm_addr_attr(tb, attr, info, addr, true); } int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, bool require_family, struct mptcp_pm_addr_entry *entry) { struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; int err; memset(entry, 0, sizeof(*entry)); err = mptcp_pm_parse_pm_addr_attr(tb, attr, info, &entry->addr, require_family); if (err) return err; if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) { s32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]); entry->ifindex = val; } if (tb[MPTCP_PM_ADDR_ATTR_FLAGS]) entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]); if (tb[MPTCP_PM_ADDR_ATTR_PORT]) entry->addr.port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT])); return 0; } static int mptcp_nl_fill_addr(struct sk_buff *skb, struct mptcp_pm_addr_entry *entry) { struct mptcp_addr_info *addr = &entry->addr; struct nlattr *attr; attr = nla_nest_start(skb, MPTCP_PM_ATTR_ADDR); if (!attr) return -EMSGSIZE; if (nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_FAMILY, addr->family)) goto nla_put_failure; if (nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_PORT, ntohs(addr->port))) goto nla_put_failure; if (nla_put_u8(skb, MPTCP_PM_ADDR_ATTR_ID, addr->id)) goto nla_put_failure; if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->flags)) goto nla_put_failure; if (entry->ifindex && nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->ifindex)) goto nla_put_failure; if (addr->family == AF_INET && nla_put_in_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR4, addr->addr.s_addr)) goto nla_put_failure; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->family == AF_INET6 && nla_put_in6_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR6, &addr->addr6)) goto nla_put_failure; #endif nla_nest_end(skb, attr); return 0; nla_put_failure: nla_nest_cancel(skb, attr); return -EMSGSIZE; } static int mptcp_pm_get_addr(u8 id, struct mptcp_pm_addr_entry *addr, struct genl_info *info) { if (info->attrs[MPTCP_PM_ATTR_TOKEN]) return mptcp_userspace_pm_get_addr(id, addr, info); return mptcp_pm_nl_get_addr(id, addr, info); } int mptcp_pm_nl_get_addr_doit(struct sk_buff *skb, struct genl_info *info) { struct mptcp_pm_addr_entry addr; struct nlattr *attr; struct sk_buff *msg; void *reply; int ret; if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ENDPOINT_ADDR)) return -EINVAL; attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; ret = mptcp_pm_parse_entry(attr, info, false, &addr); if (ret < 0) return ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, info->genlhdr->cmd); if (!reply) { GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); ret = -EMSGSIZE; goto fail; } ret = mptcp_pm_get_addr(addr.addr.id, &addr, info); if (ret) { NL_SET_ERR_MSG_ATTR(info->extack, attr, "address not found"); goto fail; } ret = mptcp_nl_fill_addr(msg, &addr); if (ret) goto fail; genlmsg_end(msg, reply); ret = genlmsg_reply(msg, info); return ret; fail: nlmsg_free(msg); return ret; } int mptcp_pm_genl_fill_addr(struct sk_buff *msg, struct netlink_callback *cb, struct mptcp_pm_addr_entry *entry) { void *hdr; hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &mptcp_genl_family, NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR); if (!hdr) return -EINVAL; if (mptcp_nl_fill_addr(msg, entry) < 0) { genlmsg_cancel(msg, hdr); return -EINVAL; } genlmsg_end(msg, hdr); return 0; } static int mptcp_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb) { const struct genl_info *info = genl_info_dump(cb); if (info->attrs[MPTCP_PM_ATTR_TOKEN]) return mptcp_userspace_pm_dump_addr(msg, cb); return mptcp_pm_nl_dump_addr(msg, cb); } int mptcp_pm_nl_get_addr_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { return mptcp_pm_dump_addr(msg, cb); } static int mptcp_pm_set_flags(struct genl_info *info) { struct mptcp_pm_addr_entry loc = { .addr = { .family = AF_UNSPEC }, }; struct nlattr *attr_loc; int ret = -EINVAL; if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR)) return ret; attr_loc = info->attrs[MPTCP_PM_ATTR_ADDR]; ret = mptcp_pm_parse_entry(attr_loc, info, false, &loc); if (ret < 0) return ret; if (info->attrs[MPTCP_PM_ATTR_TOKEN]) return mptcp_userspace_pm_set_flags(&loc, info); return mptcp_pm_nl_set_flags(&loc, info); } int mptcp_pm_nl_set_flags_doit(struct sk_buff *skb, struct genl_info *info) { return mptcp_pm_set_flags(info); } static void mptcp_nl_mcast_send(struct net *net, struct sk_buff *nlskb, gfp_t gfp) { genlmsg_multicast_netns(&mptcp_genl_family, net, nlskb, 0, MPTCP_PM_EV_GRP_OFFSET, gfp); } bool mptcp_userspace_pm_active(const struct mptcp_sock *msk) { return genl_has_listeners(&mptcp_genl_family, sock_net((const struct sock *)msk), MPTCP_PM_EV_GRP_OFFSET); } static int mptcp_event_add_subflow(struct sk_buff *skb, const struct sock *ssk) { const struct inet_sock *issk = inet_sk(ssk); const struct mptcp_subflow_context *sf; if (nla_put_u16(skb, MPTCP_ATTR_FAMILY, ssk->sk_family)) return -EMSGSIZE; switch (ssk->sk_family) { case AF_INET: if (nla_put_in_addr(skb, MPTCP_ATTR_SADDR4, issk->inet_saddr)) return -EMSGSIZE; if (nla_put_in_addr(skb, MPTCP_ATTR_DADDR4, issk->inet_daddr)) return -EMSGSIZE; break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: { if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &issk->pinet6->saddr)) return -EMSGSIZE; if (nla_put_in6_addr(skb, MPTCP_ATTR_DADDR6, &ssk->sk_v6_daddr)) return -EMSGSIZE; break; } #endif default: WARN_ON_ONCE(1); return -EMSGSIZE; } if (nla_put_be16(skb, MPTCP_ATTR_SPORT, issk->inet_sport)) return -EMSGSIZE; if (nla_put_be16(skb, MPTCP_ATTR_DPORT, issk->inet_dport)) return -EMSGSIZE; sf = mptcp_subflow_ctx(ssk); if (WARN_ON_ONCE(!sf)) return -EINVAL; if (nla_put_u8(skb, MPTCP_ATTR_LOC_ID, subflow_get_local_id(sf))) return -EMSGSIZE; if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, sf->remote_id)) return -EMSGSIZE; return 0; } static int mptcp_event_put_token_and_ssk(struct sk_buff *skb, const struct mptcp_sock *msk, const struct sock *ssk) { const struct sock *sk = (const struct sock *)msk; const struct mptcp_subflow_context *sf; u8 sk_err; if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token))) return -EMSGSIZE; if (mptcp_event_add_subflow(skb, ssk)) return -EMSGSIZE; sf = mptcp_subflow_ctx(ssk); if (WARN_ON_ONCE(!sf)) return -EINVAL; if (nla_put_u8(skb, MPTCP_ATTR_BACKUP, sf->backup)) return -EMSGSIZE; if (ssk->sk_bound_dev_if && nla_put_s32(skb, MPTCP_ATTR_IF_IDX, ssk->sk_bound_dev_if)) return -EMSGSIZE; sk_err = READ_ONCE(ssk->sk_err); if (sk_err && sk->sk_state == TCP_ESTABLISHED && nla_put_u8(skb, MPTCP_ATTR_ERROR, sk_err)) return -EMSGSIZE; return 0; } static int mptcp_event_sub_established(struct sk_buff *skb, const struct mptcp_sock *msk, const struct sock *ssk) { return mptcp_event_put_token_and_ssk(skb, msk, ssk); } static int mptcp_event_sub_closed(struct sk_buff *skb, const struct mptcp_sock *msk, const struct sock *ssk) { const struct mptcp_subflow_context *sf; if (mptcp_event_put_token_and_ssk(skb, msk, ssk)) return -EMSGSIZE; sf = mptcp_subflow_ctx(ssk); if (!sf->reset_seen) return 0; if (nla_put_u32(skb, MPTCP_ATTR_RESET_REASON, sf->reset_reason)) return -EMSGSIZE; if (nla_put_u32(skb, MPTCP_ATTR_RESET_FLAGS, sf->reset_transient)) return -EMSGSIZE; return 0; } static int mptcp_event_created(struct sk_buff *skb, const struct mptcp_sock *msk, const struct sock *ssk) { int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)); u16 flags = 0; if (err) return err; if (READ_ONCE(msk->pm.server_side)) { flags |= MPTCP_PM_EV_FLAG_SERVER_SIDE; /* Deprecated, and only set when it is the server side */ if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, 1)) return -EMSGSIZE; } if (READ_ONCE(msk->pm.remote_deny_join_id0)) flags |= MPTCP_PM_EV_FLAG_DENY_JOIN_ID0; if (flags && nla_put_u16(skb, MPTCP_ATTR_FLAGS, flags)) return -EMSGSIZE; return mptcp_event_add_subflow(skb, ssk); } void mptcp_event_addr_removed(const struct mptcp_sock *msk, uint8_t id) { struct net *net = sock_net((const struct sock *)msk); struct nlmsghdr *nlh; struct sk_buff *skb; if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) return; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) return; nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, MPTCP_EVENT_REMOVED); if (!nlh) goto nla_put_failure; if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token))) goto nla_put_failure; if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, id)) goto nla_put_failure; genlmsg_end(skb, nlh); mptcp_nl_mcast_send(net, skb, GFP_ATOMIC); return; nla_put_failure: nlmsg_free(skb); } void mptcp_event_addr_announced(const struct sock *ssk, const struct mptcp_addr_info *info) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct net *net = sock_net(ssk); struct nlmsghdr *nlh; struct sk_buff *skb; if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) return; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) return; nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, MPTCP_EVENT_ANNOUNCED); if (!nlh) goto nla_put_failure; if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token))) goto nla_put_failure; if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, info->id)) goto nla_put_failure; if (nla_put_be16(skb, MPTCP_ATTR_DPORT, info->port == 0 ? inet_sk(ssk)->inet_dport : info->port)) goto nla_put_failure; switch (info->family) { case AF_INET: if (nla_put_in_addr(skb, MPTCP_ATTR_DADDR4, info->addr.s_addr)) goto nla_put_failure; break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: if (nla_put_in6_addr(skb, MPTCP_ATTR_DADDR6, &info->addr6)) goto nla_put_failure; break; #endif default: WARN_ON_ONCE(1); goto nla_put_failure; } genlmsg_end(skb, nlh); mptcp_nl_mcast_send(net, skb, GFP_ATOMIC); return; nla_put_failure: nlmsg_free(skb); } void mptcp_event_pm_listener(const struct sock *ssk, enum mptcp_event_type event) { const struct inet_sock *issk = inet_sk(ssk); struct net *net = sock_net(ssk); struct nlmsghdr *nlh; struct sk_buff *skb; if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) return; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return; nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, event); if (!nlh) goto nla_put_failure; if (nla_put_u16(skb, MPTCP_ATTR_FAMILY, ssk->sk_family)) goto nla_put_failure; if (nla_put_be16(skb, MPTCP_ATTR_SPORT, issk->inet_sport)) goto nla_put_failure; switch (ssk->sk_family) { case AF_INET: if (nla_put_in_addr(skb, MPTCP_ATTR_SADDR4, issk->inet_saddr)) goto nla_put_failure; break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: { if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &issk->pinet6->saddr)) goto nla_put_failure; break; } #endif default: WARN_ON_ONCE(1); goto nla_put_failure; } genlmsg_end(skb, nlh); mptcp_nl_mcast_send(net, skb, GFP_KERNEL); return; nla_put_failure: nlmsg_free(skb); } void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp) { struct net *net = sock_net((const struct sock *)msk); struct nlmsghdr *nlh; struct sk_buff *skb; if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) return; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); if (!skb) return; nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, type); if (!nlh) goto nla_put_failure; switch (type) { case MPTCP_EVENT_UNSPEC: WARN_ON_ONCE(1); break; case MPTCP_EVENT_CREATED: case MPTCP_EVENT_ESTABLISHED: if (mptcp_event_created(skb, msk, ssk) < 0) goto nla_put_failure; break; case MPTCP_EVENT_CLOSED: if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)) < 0) goto nla_put_failure; break; case MPTCP_EVENT_ANNOUNCED: case MPTCP_EVENT_REMOVED: /* call mptcp_event_addr_announced()/removed instead */ WARN_ON_ONCE(1); break; case MPTCP_EVENT_SUB_ESTABLISHED: case MPTCP_EVENT_SUB_PRIORITY: if (mptcp_event_sub_established(skb, msk, ssk) < 0) goto nla_put_failure; break; case MPTCP_EVENT_SUB_CLOSED: if (mptcp_event_sub_closed(skb, msk, ssk) < 0) goto nla_put_failure; break; case MPTCP_EVENT_LISTENER_CREATED: case MPTCP_EVENT_LISTENER_CLOSED: break; } genlmsg_end(skb, nlh); mptcp_nl_mcast_send(net, skb, gfp); return; nla_put_failure: nlmsg_free(skb); } struct genl_family mptcp_genl_family __ro_after_init = { .name = MPTCP_PM_NAME, .version = MPTCP_PM_VER, .netnsok = true, .module = THIS_MODULE, .ops = mptcp_pm_nl_ops, .n_ops = ARRAY_SIZE(mptcp_pm_nl_ops), .resv_start_op = MPTCP_PM_CMD_SUBFLOW_DESTROY + 1, .mcgrps = mptcp_pm_mcgrps, .n_mcgrps = ARRAY_SIZE(mptcp_pm_mcgrps), }; void __init mptcp_pm_nl_init(void) { if (genl_register_family(&mptcp_genl_family)) panic("Failed to register MPTCP PM netlink family\n"); }
465 5434 5434 1185 1601 27 2496 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Credentials management - see Documentation/security/credentials.rst * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _LINUX_CRED_H #define _LINUX_CRED_H #include <linux/capability.h> #include <linux/init.h> #include <linux/key.h> #include <linux/atomic.h> #include <linux/refcount.h> #include <linux/uidgid.h> #include <linux/sched.h> #include <linux/sched/user.h> struct cred; struct inode; /* * COW Supplementary groups list */ struct group_info { refcount_t usage; int ngroups; kgid_t gid[]; } __randomize_layout; /** * get_group_info - Get a reference to a group info structure * @group_info: The group info to reference * * This gets a reference to a set of supplementary groups. * * If the caller is accessing a task's credentials, they must hold the RCU read * lock when reading. */ static inline struct group_info *get_group_info(struct group_info *gi) { refcount_inc(&gi->usage); return gi; } /** * put_group_info - Release a reference to a group info structure * @group_info: The group info to release */ #define put_group_info(group_info) \ do { \ if (refcount_dec_and_test(&(group_info)->usage)) \ groups_free(group_info); \ } while (0) #ifdef CONFIG_MULTIUSER extern struct group_info *groups_alloc(int); extern void groups_free(struct group_info *); extern int in_group_p(kgid_t); extern int in_egroup_p(kgid_t); extern int groups_search(const struct group_info *, kgid_t); extern int set_current_groups(struct group_info *); extern void set_groups(struct cred *, struct group_info *); extern bool may_setgroups(void); extern void groups_sort(struct group_info *); #else static inline void groups_free(struct group_info *group_info) { } static inline int in_group_p(kgid_t grp) { return 1; } static inline int in_egroup_p(kgid_t grp) { return 1; } static inline int groups_search(const struct group_info *group_info, kgid_t grp) { return 1; } #endif /* * The security context of a task * * The parts of the context break down into two categories: * * (1) The objective context of a task. These parts are used when some other * task is attempting to affect this one. * * (2) The subjective context. These details are used when the task is acting * upon another object, be that a file, a task, a key or whatever. * * Note that some members of this structure belong to both categories - the * LSM security pointer for instance. * * A task has two security pointers. task->real_cred points to the objective * context that defines that task's actual details. The objective part of this * context is used whenever that task is acted upon. * * task->cred points to the subjective context that defines the details of how * that task is going to act upon another object. This may be overridden * temporarily to point to another security context, but normally points to the * same context as task->real_cred. */ struct cred { atomic_long_t usage; kuid_t uid; /* real UID of the task */ kgid_t gid; /* real GID of the task */ kuid_t suid; /* saved UID of the task */ kgid_t sgid; /* saved GID of the task */ kuid_t euid; /* effective UID of the task */ kgid_t egid; /* effective GID of the task */ kuid_t fsuid; /* UID for VFS ops */ kgid_t fsgid; /* GID for VFS ops */ unsigned securebits; /* SUID-less security management */ kernel_cap_t cap_inheritable; /* caps our children can inherit */ kernel_cap_t cap_permitted; /* caps we're permitted */ kernel_cap_t cap_effective; /* caps we can actually use */ kernel_cap_t cap_bset; /* capability bounding set */ kernel_cap_t cap_ambient; /* Ambient capability set */ #ifdef CONFIG_KEYS unsigned char jit_keyring; /* default keyring to attach requested * keys to */ struct key *session_keyring; /* keyring inherited over fork */ struct key *process_keyring; /* keyring private to this process */ struct key *thread_keyring; /* keyring private to this thread */ struct key *request_key_auth; /* assumed request_key authority */ #endif #ifdef CONFIG_SECURITY void *security; /* LSM security */ #endif struct user_struct *user; /* real user ID subscription */ struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */ struct ucounts *ucounts; struct group_info *group_info; /* supplementary groups for euid/fsgid */ /* RCU deletion */ union { int non_rcu; /* Can we skip RCU deletion? */ struct rcu_head rcu; /* RCU deletion hook */ }; } __randomize_layout; extern void __put_cred(struct cred *); extern void exit_creds(struct task_struct *); extern int copy_creds(struct task_struct *, u64); extern const struct cred *get_task_cred(struct task_struct *); extern struct cred *cred_alloc_blank(void); extern struct cred *prepare_creds(void); extern struct cred *prepare_exec_creds(void); extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); extern struct cred *prepare_kernel_cred(struct task_struct *); extern int set_security_override(struct cred *, u32); extern int set_security_override_from_ctx(struct cred *, const char *); extern int set_create_files_as(struct cred *, struct inode *); extern int cred_fscmp(const struct cred *, const struct cred *); extern void __init cred_init(void); extern int set_cred_ucounts(struct cred *); static inline bool cap_ambient_invariant_ok(const struct cred *cred) { return cap_issubset(cred->cap_ambient, cap_intersect(cred->cap_permitted, cred->cap_inheritable)); } static inline const struct cred *override_creds(const struct cred *override_cred) { return rcu_replace_pointer(current->cred, override_cred, 1); } static inline const struct cred *revert_creds(const struct cred *revert_cred) { return rcu_replace_pointer(current->cred, revert_cred, 1); } /** * get_cred_many - Get references on a set of credentials * @cred: The credentials to reference * @nr: Number of references to acquire * * Get references on the specified set of credentials. The caller must release * all acquired reference. If %NULL is passed, it is returned with no action. * * This is used to deal with a committed set of credentials. Although the * pointer is const, this will temporarily discard the const and increment the * usage count. The purpose of this is to attempt to catch at compile time the * accidental alteration of a set of credentials that should be considered * immutable. */ static inline const struct cred *get_cred_many(const struct cred *cred, int nr) { struct cred *nonconst_cred = (struct cred *) cred; if (!cred) return cred; nonconst_cred->non_rcu = 0; atomic_long_add(nr, &nonconst_cred->usage); return cred; } /* * get_cred - Get a reference on a set of credentials * @cred: The credentials to reference * * Get a reference on the specified set of credentials. The caller must * release the reference. If %NULL is passed, it is returned with no action. * * This is used to deal with a committed set of credentials. */ static inline const struct cred *get_cred(const struct cred *cred) { return get_cred_many(cred, 1); } static inline const struct cred *get_cred_rcu(const struct cred *cred) { struct cred *nonconst_cred = (struct cred *) cred; if (!cred) return NULL; if (!atomic_long_inc_not_zero(&nonconst_cred->usage)) return NULL; nonconst_cred->non_rcu = 0; return cred; } /** * put_cred - Release a reference to a set of credentials * @cred: The credentials to release * @nr: Number of references to release * * Release a reference to a set of credentials, deleting them when the last ref * is released. If %NULL is passed, nothing is done. * * This takes a const pointer to a set of credentials because the credentials * on task_struct are attached by const pointers to prevent accidental * alteration of otherwise immutable credential sets. */ static inline void put_cred_many(const struct cred *_cred, int nr) { struct cred *cred = (struct cred *) _cred; if (cred) { if (atomic_long_sub_and_test(nr, &cred->usage)) __put_cred(cred); } } /* * put_cred - Release a reference to a set of credentials * @cred: The credentials to release * * Release a reference to a set of credentials, deleting them when the last ref * is released. If %NULL is passed, nothing is done. */ static inline void put_cred(const struct cred *cred) { put_cred_many(cred, 1); } DEFINE_FREE(put_cred, struct cred *, if (!IS_ERR_OR_NULL(_T)) put_cred(_T)) /** * current_cred - Access the current task's subjective credentials * * Access the subjective credentials of the current task. RCU-safe, * since nobody else can modify it. */ #define current_cred() \ rcu_dereference_protected(current->cred, 1) /** * current_real_cred - Access the current task's objective credentials * * Access the objective credentials of the current task. RCU-safe, * since nobody else can modify it. */ #define current_real_cred() \ rcu_dereference_protected(current->real_cred, 1) /** * __task_cred - Access a task's objective credentials * @task: The task to query * * Access the objective credentials of a task. The caller must hold the RCU * readlock. * * The result of this function should not be passed directly to get_cred(); * rather get_task_cred() should be used instead. */ #define __task_cred(task) \ rcu_dereference((task)->real_cred) /** * get_current_cred - Get the current task's subjective credentials * * Get the subjective credentials of the current task, pinning them so that * they can't go away. Accessing the current task's credentials directly is * not permitted. */ #define get_current_cred() \ (get_cred(current_cred())) /** * get_current_user - Get the current task's user_struct * * Get the user record of the current task, pinning it so that it can't go * away. */ #define get_current_user() \ ({ \ struct user_struct *__u; \ const struct cred *__cred; \ __cred = current_cred(); \ __u = get_uid(__cred->user); \ __u; \ }) /** * get_current_groups - Get the current task's supplementary group list * * Get the supplementary group list of the current task, pinning it so that it * can't go away. */ #define get_current_groups() \ ({ \ struct group_info *__groups; \ const struct cred *__cred; \ __cred = current_cred(); \ __groups = get_group_info(__cred->group_info); \ __groups; \ }) #define task_cred_xxx(task, xxx) \ ({ \ __typeof__(((struct cred *)NULL)->xxx) ___val; \ rcu_read_lock(); \ ___val = __task_cred((task))->xxx; \ rcu_read_unlock(); \ ___val; \ }) #define task_uid(task) (task_cred_xxx((task), uid)) #define task_euid(task) (task_cred_xxx((task), euid)) #define task_ucounts(task) (task_cred_xxx((task), ucounts)) #define current_cred_xxx(xxx) \ ({ \ current_cred()->xxx; \ }) #define current_uid() (current_cred_xxx(uid)) #define current_gid() (current_cred_xxx(gid)) #define current_euid() (current_cred_xxx(euid)) #define current_egid() (current_cred_xxx(egid)) #define current_suid() (current_cred_xxx(suid)) #define current_sgid() (current_cred_xxx(sgid)) #define current_fsuid() (current_cred_xxx(fsuid)) #define current_fsgid() (current_cred_xxx(fsgid)) #define current_cap() (current_cred_xxx(cap_effective)) #define current_user() (current_cred_xxx(user)) #define current_ucounts() (current_cred_xxx(ucounts)) extern struct user_namespace init_user_ns; #ifdef CONFIG_USER_NS #define current_user_ns() (current_cred_xxx(user_ns)) #else static inline struct user_namespace *current_user_ns(void) { return &init_user_ns; } #endif #define current_uid_gid(_uid, _gid) \ do { \ const struct cred *__cred; \ __cred = current_cred(); \ *(_uid) = __cred->uid; \ *(_gid) = __cred->gid; \ } while(0) #define current_euid_egid(_euid, _egid) \ do { \ const struct cred *__cred; \ __cred = current_cred(); \ *(_euid) = __cred->euid; \ *(_egid) = __cred->egid; \ } while(0) #define current_fsuid_fsgid(_fsuid, _fsgid) \ do { \ const struct cred *__cred; \ __cred = current_cred(); \ *(_fsuid) = __cred->fsuid; \ *(_fsgid) = __cred->fsgid; \ } while(0) #endif /* _LINUX_CRED_H */
41 35 35 35 7 7 30 28 28 19 19 6 19 13 19 19 26 26 13 6 19 19 12 12 6 6 6 1 2 3 6 5 5 2 2 1 3 1 2 1 2 2 2 1 4 6 5 4 2 3 2 1 4 48 48 48 44 20 20 48 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 /* * Copyright (c) 2014, Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "bearer.h" #include "link.h" #include "name_table.h" #include "socket.h" #include "node.h" #include "net.h" #include <net/genetlink.h> #include <linux/string_helpers.h> #include <linux/tipc_config.h> /* The legacy API had an artificial message length limit called * ULTRA_STRING_MAX_LEN. */ #define ULTRA_STRING_MAX_LEN 32768 #define TIPC_SKB_MAX TLV_SPACE(ULTRA_STRING_MAX_LEN) #define REPLY_TRUNCATED "<truncated>\n" struct tipc_nl_compat_msg { u16 cmd; int rep_type; int rep_size; int req_type; int req_size; struct net *net; struct sk_buff *rep; struct tlv_desc *req; struct sock *dst_sk; }; struct tipc_nl_compat_cmd_dump { int (*header)(struct tipc_nl_compat_msg *); int (*dumpit)(struct sk_buff *, struct netlink_callback *); int (*format)(struct tipc_nl_compat_msg *msg, struct nlattr **attrs); }; struct tipc_nl_compat_cmd_doit { int (*doit)(struct sk_buff *skb, struct genl_info *info); int (*transcode)(struct tipc_nl_compat_cmd_doit *cmd, struct sk_buff *skb, struct tipc_nl_compat_msg *msg); }; static int tipc_skb_tailroom(struct sk_buff *skb) { int tailroom; int limit; tailroom = skb_tailroom(skb); limit = TIPC_SKB_MAX - skb->len; if (tailroom < limit) return tailroom; return limit; } static inline int TLV_GET_DATA_LEN(struct tlv_desc *tlv) { return TLV_GET_LEN(tlv) - TLV_SPACE(0); } static int tipc_add_tlv(struct sk_buff *skb, u16 type, void *data, u16 len) { struct tlv_desc *tlv = (struct tlv_desc *)skb_tail_pointer(skb); if (tipc_skb_tailroom(skb) < TLV_SPACE(len)) return -EMSGSIZE; skb_put(skb, TLV_SPACE(len)); memset(tlv, 0, TLV_SPACE(len)); tlv->tlv_type = htons(type); tlv->tlv_len = htons(TLV_LENGTH(len)); if (len && data) memcpy(TLV_DATA(tlv), data, len); return 0; } static void tipc_tlv_init(struct sk_buff *skb, u16 type) { struct tlv_desc *tlv = (struct tlv_desc *)skb->data; TLV_SET_LEN(tlv, 0); TLV_SET_TYPE(tlv, type); skb_put(skb, sizeof(struct tlv_desc)); } static __printf(2, 3) int tipc_tlv_sprintf(struct sk_buff *skb, const char *fmt, ...) { int n; u16 len; u32 rem; char *buf; struct tlv_desc *tlv; va_list args; rem = tipc_skb_tailroom(skb); tlv = (struct tlv_desc *)skb->data; len = TLV_GET_LEN(tlv); buf = TLV_DATA(tlv) + len; va_start(args, fmt); n = vscnprintf(buf, rem, fmt, args); va_end(args); TLV_SET_LEN(tlv, n + len); skb_put(skb, n); return n; } static struct sk_buff *tipc_tlv_alloc(int size) { int hdr_len; struct sk_buff *buf; size = TLV_SPACE(size); hdr_len = nlmsg_total_size(GENL_HDRLEN + TIPC_GENL_HDRLEN); buf = alloc_skb(hdr_len + size, GFP_KERNEL); if (!buf) return NULL; skb_reserve(buf, hdr_len); return buf; } static struct sk_buff *tipc_get_err_tlv(char *str) { int str_len = strlen(str) + 1; struct sk_buff *buf; buf = tipc_tlv_alloc(str_len); if (buf) tipc_add_tlv(buf, TIPC_TLV_ERROR_STRING, str, str_len); return buf; } static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, struct tipc_nl_compat_msg *msg, struct sk_buff *arg) { struct genl_dumpit_info info; int len = 0; int err; struct sk_buff *buf; struct nlmsghdr *nlmsg; struct netlink_callback cb; struct nlattr **attrbuf; memset(&cb, 0, sizeof(cb)); cb.nlh = (struct nlmsghdr *)arg->data; cb.skb = arg; cb.data = &info; buf = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!buf) return -ENOMEM; buf->sk = msg->dst_sk; if (__tipc_dump_start(&cb, msg->net)) { kfree_skb(buf); return -ENOMEM; } attrbuf = kcalloc(tipc_genl_family.maxattr + 1, sizeof(struct nlattr *), GFP_KERNEL); if (!attrbuf) { err = -ENOMEM; goto err_out; } info.info.attrs = attrbuf; if (nlmsg_len(cb.nlh) > 0) { err = nlmsg_parse_deprecated(cb.nlh, GENL_HDRLEN, attrbuf, tipc_genl_family.maxattr, tipc_genl_family.policy, NULL); if (err) goto err_out; } do { int rem; len = (*cmd->dumpit)(buf, &cb); nlmsg_for_each_msg(nlmsg, nlmsg_hdr(buf), len, rem) { err = nlmsg_parse_deprecated(nlmsg, GENL_HDRLEN, attrbuf, tipc_genl_family.maxattr, tipc_genl_family.policy, NULL); if (err) goto err_out; err = (*cmd->format)(msg, attrbuf); if (err) goto err_out; if (tipc_skb_tailroom(msg->rep) <= 1) { err = -EMSGSIZE; goto err_out; } } skb_reset_tail_pointer(buf); buf->len = 0; } while (len); err = 0; err_out: kfree(attrbuf); tipc_dump_done(&cb); kfree_skb(buf); if (err == -EMSGSIZE) { /* The legacy API only considered messages filling * "ULTRA_STRING_MAX_LEN" to be truncated. */ if ((TIPC_SKB_MAX - msg->rep->len) <= 1) { char *tail = skb_tail_pointer(msg->rep); if (*tail != '\0') sprintf(tail - sizeof(REPLY_TRUNCATED) - 1, REPLY_TRUNCATED); } return 0; } return err; } static int tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, struct tipc_nl_compat_msg *msg) { struct nlmsghdr *nlh; struct sk_buff *arg; int err; if (msg->req_type && (!msg->req_size || !TLV_CHECK_TYPE(msg->req, msg->req_type))) return -EINVAL; msg->rep = tipc_tlv_alloc(msg->rep_size); if (!msg->rep) return -ENOMEM; if (msg->rep_type) tipc_tlv_init(msg->rep, msg->rep_type); if (cmd->header) { err = (*cmd->header)(msg); if (err) { kfree_skb(msg->rep); msg->rep = NULL; return err; } } arg = nlmsg_new(0, GFP_KERNEL); if (!arg) { kfree_skb(msg->rep); msg->rep = NULL; return -ENOMEM; } nlh = nlmsg_put(arg, 0, 0, tipc_genl_family.id, 0, NLM_F_MULTI); if (!nlh) { kfree_skb(arg); kfree_skb(msg->rep); msg->rep = NULL; return -EMSGSIZE; } nlmsg_end(arg, nlh); err = __tipc_nl_compat_dumpit(cmd, msg, arg); if (err) { kfree_skb(msg->rep); msg->rep = NULL; } kfree_skb(arg); return err; } static int __tipc_nl_compat_doit(struct tipc_nl_compat_cmd_doit *cmd, struct tipc_nl_compat_msg *msg) { int err; struct sk_buff *doit_buf; struct sk_buff *trans_buf; struct nlattr **attrbuf; struct genl_info info; trans_buf = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!trans_buf) return -ENOMEM; attrbuf = kmalloc_array(tipc_genl_family.maxattr + 1, sizeof(struct nlattr *), GFP_KERNEL); if (!attrbuf) { err = -ENOMEM; goto trans_out; } doit_buf = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!doit_buf) { err = -ENOMEM; goto attrbuf_out; } memset(&info, 0, sizeof(info)); info.attrs = attrbuf; rtnl_lock(); err = (*cmd->transcode)(cmd, trans_buf, msg); if (err) goto doit_out; err = nla_parse_deprecated(attrbuf, tipc_genl_family.maxattr, (const struct nlattr *)trans_buf->data, trans_buf->len, NULL, NULL); if (err) goto doit_out; doit_buf->sk = msg->dst_sk; err = (*cmd->doit)(doit_buf, &info); doit_out: rtnl_unlock(); kfree_skb(doit_buf); attrbuf_out: kfree(attrbuf); trans_out: kfree_skb(trans_buf); return err; } static int tipc_nl_compat_doit(struct tipc_nl_compat_cmd_doit *cmd, struct tipc_nl_compat_msg *msg) { int err; if (msg->req_type && (!msg->req_size || !TLV_CHECK_TYPE(msg->req, msg->req_type))) return -EINVAL; err = __tipc_nl_compat_doit(cmd, msg); if (err) return err; /* The legacy API considered an empty message a success message */ msg->rep = tipc_tlv_alloc(0); if (!msg->rep) return -ENOMEM; return 0; } static int tipc_nl_compat_bearer_dump(struct tipc_nl_compat_msg *msg, struct nlattr **attrs) { struct nlattr *bearer[TIPC_NLA_BEARER_MAX + 1]; int err; if (!attrs[TIPC_NLA_BEARER]) return -EINVAL; err = nla_parse_nested_deprecated(bearer, TIPC_NLA_BEARER_MAX, attrs[TIPC_NLA_BEARER], NULL, NULL); if (err) return err; return tipc_add_tlv(msg->rep, TIPC_TLV_BEARER_NAME, nla_data(bearer[TIPC_NLA_BEARER_NAME]), nla_len(bearer[TIPC_NLA_BEARER_NAME])); } static int tipc_nl_compat_bearer_enable(struct tipc_nl_compat_cmd_doit *cmd, struct sk_buff *skb, struct tipc_nl_compat_msg *msg) { struct nlattr *prop; struct nlattr *bearer; struct tipc_bearer_config *b; int len; b = (struct tipc_bearer_config *)TLV_DATA(msg->req); bearer = nla_nest_start_noflag(skb, TIPC_NLA_BEARER); if (!bearer) return -EMSGSIZE; len = TLV_GET_DATA_LEN(msg->req); len -= offsetof(struct tipc_bearer_config, name); if (len <= 0) return -EINVAL; len = min_t(int, len, TIPC_MAX_BEARER_NAME); if (!string_is_terminated(b->name, len)) return -EINVAL; if (nla_put_string(skb, TIPC_NLA_BEARER_NAME, b->name)) return -EMSGSIZE; if (nla_put_u32(skb, TIPC_NLA_BEARER_DOMAIN, ntohl(b->disc_domain))) return -EMSGSIZE; if (ntohl(b->priority) <= TIPC_MAX_LINK_PRI) { prop = nla_nest_start_noflag(skb, TIPC_NLA_BEARER_PROP); if (!prop) return -EMSGSIZE; if (nla_put_u32(skb, TIPC_NLA_PROP_PRIO, ntohl(b->priority))) return -EMSGSIZE; nla_nest_end(skb, prop); } nla_nest_end(skb, bearer); return 0; } static int tipc_nl_compat_bearer_disable(struct tipc_nl_compat_cmd_doit *cmd, struct sk_buff *skb, struct tipc_nl_compat_msg *msg) { char *name; struct nlattr *bearer; int len; name = (char *)TLV_DATA(msg->req); bearer = nla_nest_start_noflag(skb, TIPC_NLA_BEARER); if (!bearer) return -EMSGSIZE; len = TLV_GET_DATA_LEN(msg->req); if (len <= 0) return -EINVAL; len = min_t(int, len, TIPC_MAX_BEARER_NAME); if (!string_is_terminated(name, len)) return -EINVAL; if (nla_put_string(skb, TIPC_NLA_BEARER_NAME, name)) return -EMSGSIZE; nla_nest_end(skb, bearer); return 0; } static inline u32 perc(u32 count, u32 total) { return (count * 100 + (total / 2)) / total; } static void __fill_bc_link_stat(struct tipc_nl_compat_msg *msg, struct nlattr *prop[], struct nlattr *stats[]) { tipc_tlv_sprintf(msg->rep, " Window:%u packets\n", nla_get_u32(prop[TIPC_NLA_PROP_WIN])); tipc_tlv_sprintf(msg->rep, " RX packets:%u fragments:%u/%u bundles:%u/%u\n", nla_get_u32(stats[TIPC_NLA_STATS_RX_INFO]), nla_get_u32(stats[TIPC_NLA_STATS_RX_FRAGMENTS]), nla_get_u32(stats[TIPC_NLA_STATS_RX_FRAGMENTED]), nla_get_u32(stats[TIPC_NLA_STATS_RX_BUNDLES]), nla_get_u32(stats[TIPC_NLA_STATS_RX_BUNDLED])); tipc_tlv_sprintf(msg->rep, " TX packets:%u fragments:%u/%u bundles:%u/%u\n", nla_get_u32(stats[TIPC_NLA_STATS_TX_INFO]), nla_get_u32(stats[TIPC_NLA_STATS_TX_FRAGMENTS]), nla_get_u32(stats[TIPC_NLA_STATS_TX_FRAGMENTED]), nla_get_u32(stats[TIPC_NLA_STATS_TX_BUNDLES]), nla_get_u32(stats[TIPC_NLA_STATS_TX_BUNDLED])); tipc_tlv_sprintf(msg->rep, " RX naks:%u defs:%u dups:%u\n", nla_get_u32(stats[TIPC_NLA_STATS_RX_NACKS]), nla_get_u32(stats[TIPC_NLA_STATS_RX_DEFERRED]), nla_get_u32(stats[TIPC_NLA_STATS_DUPLICATES])); tipc_tlv_sprintf(msg->rep, " TX naks:%u acks:%u dups:%u\n", nla_get_u32(stats[TIPC_NLA_STATS_TX_NACKS]), nla_get_u32(stats[TIPC_NLA_STATS_TX_ACKS]), nla_get_u32(stats[TIPC_NLA_STATS_RETRANSMITTED])); tipc_tlv_sprintf(msg->rep, " Congestion link:%u Send queue max:%u avg:%u", nla_get_u32(stats[TIPC_NLA_STATS_LINK_CONGS]), nla_get_u32(stats[TIPC_NLA_STATS_MAX_QUEUE]), nla_get_u32(stats[TIPC_NLA_STATS_AVG_QUEUE])); } static int tipc_nl_compat_link_stat_dump(struct tipc_nl_compat_msg *msg, struct nlattr **attrs) { char *name; struct nlattr *link[TIPC_NLA_LINK_MAX + 1]; struct nlattr *prop[TIPC_NLA_PROP_MAX + 1]; struct nlattr *stats[TIPC_NLA_STATS_MAX + 1]; int err; int len; if (!attrs[TIPC_NLA_LINK]) return -EINVAL; err = nla_parse_nested_deprecated(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], NULL, NULL); if (err) return err; if (!link[TIPC_NLA_LINK_PROP]) return -EINVAL; err = nla_parse_nested_deprecated(prop, TIPC_NLA_PROP_MAX, link[TIPC_NLA_LINK_PROP], NULL, NULL); if (err) return err; if (!link[TIPC_NLA_LINK_STATS]) return -EINVAL; err = nla_parse_nested_deprecated(stats, TIPC_NLA_STATS_MAX, link[TIPC_NLA_LINK_STATS], NULL, NULL); if (err) return err; name = (char *)TLV_DATA(msg->req); len = TLV_GET_DATA_LEN(msg->req); if (len <= 0) return -EINVAL; len = min_t(int, len, TIPC_MAX_LINK_NAME); if (!string_is_terminated(name, len)) return -EINVAL; if (strcmp(name, nla_data(link[TIPC_NLA_LINK_NAME])) != 0) return 0; tipc_tlv_sprintf(msg->rep, "\nLink <%s>\n", (char *)nla_data(link[TIPC_NLA_LINK_NAME])); if (link[TIPC_NLA_LINK_BROADCAST]) { __fill_bc_link_stat(msg, prop, stats); return 0; } if (link[TIPC_NLA_LINK_ACTIVE]) tipc_tlv_sprintf(msg->rep, " ACTIVE"); else if (link[TIPC_NLA_LINK_UP]) tipc_tlv_sprintf(msg->rep, " STANDBY"); else tipc_tlv_sprintf(msg->rep, " DEFUNCT"); tipc_tlv_sprintf(msg->rep, " MTU:%u Priority:%u", nla_get_u32(link[TIPC_NLA_LINK_MTU]), nla_get_u32(prop[TIPC_NLA_PROP_PRIO])); tipc_tlv_sprintf(msg->rep, " Tolerance:%u ms Window:%u packets\n", nla_get_u32(prop[TIPC_NLA_PROP_TOL]), nla_get_u32(prop[TIPC_NLA_PROP_WIN])); tipc_tlv_sprintf(msg->rep, " RX packets:%u fragments:%u/%u bundles:%u/%u\n", nla_get_u32(link[TIPC_NLA_LINK_RX]) - nla_get_u32(stats[TIPC_NLA_STATS_RX_INFO]), nla_get_u32(stats[TIPC_NLA_STATS_RX_FRAGMENTS]), nla_get_u32(stats[TIPC_NLA_STATS_RX_FRAGMENTED]), nla_get_u32(stats[TIPC_NLA_STATS_RX_BUNDLES]), nla_get_u32(stats[TIPC_NLA_STATS_RX_BUNDLED])); tipc_tlv_sprintf(msg->rep, " TX packets:%u fragments:%u/%u bundles:%u/%u\n", nla_get_u32(link[TIPC_NLA_LINK_TX]) - nla_get_u32(stats[TIPC_NLA_STATS_TX_INFO]), nla_get_u32(stats[TIPC_NLA_STATS_TX_FRAGMENTS]), nla_get_u32(stats[TIPC_NLA_STATS_TX_FRAGMENTED]), nla_get_u32(stats[TIPC_NLA_STATS_TX_BUNDLES]), nla_get_u32(stats[TIPC_NLA_STATS_TX_BUNDLED])); tipc_tlv_sprintf(msg->rep, " TX profile sample:%u packets average:%u octets\n", nla_get_u32(stats[TIPC_NLA_STATS_MSG_LEN_CNT]), nla_get_u32(stats[TIPC_NLA_STATS_MSG_LEN_TOT]) / nla_get_u32(stats[TIPC_NLA_STATS_MSG_PROF_TOT])); tipc_tlv_sprintf(msg->rep, " 0-64:%u%% -256:%u%% -1024:%u%% -4096:%u%% ", perc(nla_get_u32(stats[TIPC_NLA_STATS_MSG_LEN_P0]), nla_get_u32(stats[TIPC_NLA_STATS_MSG_PROF_TOT])), perc(nla_get_u32(stats[TIPC_NLA_STATS_MSG_LEN_P1]), nla_get_u32(stats[TIPC_NLA_STATS_MSG_PROF_TOT])), perc(nla_get_u32(stats[TIPC_NLA_STATS_MSG_LEN_P2]), nla_get_u32(stats[TIPC_NLA_STATS_MSG_PROF_TOT])), perc(nla_get_u32(stats[TIPC_NLA_STATS_MSG_LEN_P3]), nla_get_u32(stats[TIPC_NLA_STATS_MSG_PROF_TOT]))); tipc_tlv_sprintf(msg->rep, "-16384:%u%% -32768:%u%% -66000:%u%%\n", perc(nla_get_u32(stats[TIPC_NLA_STATS_MSG_LEN_P4]), nla_get_u32(stats[TIPC_NLA_STATS_MSG_PROF_TOT])), perc(nla_get_u32(stats[TIPC_NLA_STATS_MSG_LEN_P5]), nla_get_u32(stats[TIPC_NLA_STATS_MSG_PROF_TOT])), perc(nla_get_u32(stats[TIPC_NLA_STATS_MSG_LEN_P6]), nla_get_u32(stats[TIPC_NLA_STATS_MSG_PROF_TOT]))); tipc_tlv_sprintf(msg->rep, " RX states:%u probes:%u naks:%u defs:%u dups:%u\n", nla_get_u32(stats[TIPC_NLA_STATS_RX_STATES]), nla_get_u32(stats[TIPC_NLA_STATS_RX_PROBES]), nla_get_u32(stats[TIPC_NLA_STATS_RX_NACKS]), nla_get_u32(stats[TIPC_NLA_STATS_RX_DEFERRED]), nla_get_u32(stats[TIPC_NLA_STATS_DUPLICATES])); tipc_tlv_sprintf(msg->rep, " TX states:%u probes:%u naks:%u acks:%u dups:%u\n", nla_get_u32(stats[TIPC_NLA_STATS_TX_STATES]), nla_get_u32(stats[TIPC_NLA_STATS_TX_PROBES]), nla_get_u32(stats[TIPC_NLA_STATS_TX_NACKS]), nla_get_u32(stats[TIPC_NLA_STATS_TX_ACKS]), nla_get_u32(stats[TIPC_NLA_STATS_RETRANSMITTED])); tipc_tlv_sprintf(msg->rep, " Congestion link:%u Send queue max:%u avg:%u", nla_get_u32(stats[TIPC_NLA_STATS_LINK_CONGS]), nla_get_u32(stats[TIPC_NLA_STATS_MAX_QUEUE]), nla_get_u32(stats[TIPC_NLA_STATS_AVG_QUEUE])); return 0; } static int tipc_nl_compat_link_dump(struct tipc_nl_compat_msg *msg, struct nlattr **attrs) { struct nlattr *link[TIPC_NLA_LINK_MAX + 1]; struct tipc_link_info link_info; int err; if (!attrs[TIPC_NLA_LINK]) return -EINVAL; err = nla_parse_nested_deprecated(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], NULL, NULL); if (err) return err; link_info.dest = htonl(nla_get_flag(link[TIPC_NLA_LINK_DEST])); link_info.up = htonl(nla_get_flag(link[TIPC_NLA_LINK_UP])); nla_strscpy(link_info.str, link[TIPC_NLA_LINK_NAME], TIPC_MAX_LINK_NAME); return tipc_add_tlv(msg->rep, TIPC_TLV_LINK_INFO, &link_info, sizeof(link_info)); } static int __tipc_add_link_prop(struct sk_buff *skb, struct tipc_nl_compat_msg *msg, struct tipc_link_config *lc) { switch (msg->cmd) { case TIPC_CMD_SET_LINK_PRI: return nla_put_u32(skb, TIPC_NLA_PROP_PRIO, ntohl(lc->value)); case TIPC_CMD_SET_LINK_TOL: return nla_put_u32(skb, TIPC_NLA_PROP_TOL, ntohl(lc->value)); case TIPC_CMD_SET_LINK_WINDOW: return nla_put_u32(skb, TIPC_NLA_PROP_WIN, ntohl(lc->value)); } return -EINVAL; } static int tipc_nl_compat_media_set(struct sk_buff *skb, struct tipc_nl_compat_msg *msg) { struct nlattr *prop; struct nlattr *media; struct tipc_link_config *lc; lc = (struct tipc_link_config *)TLV_DATA(msg->req); media = nla_nest_start_noflag(skb, TIPC_NLA_MEDIA); if (!media) return -EMSGSIZE; if (nla_put_string(skb, TIPC_NLA_MEDIA_NAME, lc->name)) return -EMSGSIZE; prop = nla_nest_start_noflag(skb, TIPC_NLA_MEDIA_PROP); if (!prop) return -EMSGSIZE; __tipc_add_link_prop(skb, msg, lc); nla_nest_end(skb, prop); nla_nest_end(skb, media); return 0; } static int tipc_nl_compat_bearer_set(struct sk_buff *skb, struct tipc_nl_compat_msg *msg) { struct nlattr *prop; struct nlattr *bearer; struct tipc_link_config *lc; lc = (struct tipc_link_config *)TLV_DATA(msg->req); bearer = nla_nest_start_noflag(skb, TIPC_NLA_BEARER); if (!bearer) return -EMSGSIZE; if (nla_put_string(skb, TIPC_NLA_BEARER_NAME, lc->name)) return -EMSGSIZE; prop = nla_nest_start_noflag(skb, TIPC_NLA_BEARER_PROP); if (!prop) return -EMSGSIZE; __tipc_add_link_prop(skb, msg, lc); nla_nest_end(skb, prop); nla_nest_end(skb, bearer); return 0; } static int __tipc_nl_compat_link_set(struct sk_buff *skb, struct tipc_nl_compat_msg *msg) { struct nlattr *prop; struct nlattr *link; struct tipc_link_config *lc; lc = (struct tipc_link_config *)TLV_DATA(msg->req); link = nla_nest_start_noflag(skb, TIPC_NLA_LINK); if (!link) return -EMSGSIZE; if (nla_put_string(skb, TIPC_NLA_LINK_NAME, lc->name)) return -EMSGSIZE; prop = nla_nest_start_noflag(skb, TIPC_NLA_LINK_PROP); if (!prop) return -EMSGSIZE; __tipc_add_link_prop(skb, msg, lc); nla_nest_end(skb, prop); nla_nest_end(skb, link); return 0; } static int tipc_nl_compat_link_set(struct tipc_nl_compat_cmd_doit *cmd, struct sk_buff *skb, struct tipc_nl_compat_msg *msg) { struct tipc_link_config *lc; struct tipc_bearer *bearer; struct tipc_media *media; int len; lc = (struct tipc_link_config *)TLV_DATA(msg->req); len = TLV_GET_DATA_LEN(msg->req); len -= offsetof(struct tipc_link_config, name); if (len <= 0) return -EINVAL; len = min_t(int, len, TIPC_MAX_LINK_NAME); if (!string_is_terminated(lc->name, len)) return -EINVAL; media = tipc_media_find(lc->name); if (media) { cmd->doit = &__tipc_nl_media_set; return tipc_nl_compat_media_set(skb, msg); } bearer = tipc_bearer_find(msg->net, lc->name); if (bearer) { cmd->doit = &__tipc_nl_bearer_set; return tipc_nl_compat_bearer_set(skb, msg); } return __tipc_nl_compat_link_set(skb, msg); } static int tipc_nl_compat_link_reset_stats(struct tipc_nl_compat_cmd_doit *cmd, struct sk_buff *skb, struct tipc_nl_compat_msg *msg) { char *name; struct nlattr *link; int len; name = (char *)TLV_DATA(msg->req); link = nla_nest_start_noflag(skb, TIPC_NLA_LINK); if (!link) return -EMSGSIZE; len = TLV_GET_DATA_LEN(msg->req); if (len <= 0) return -EINVAL; len = min_t(int, len, TIPC_MAX_LINK_NAME); if (!string_is_terminated(name, len)) return -EINVAL; if (nla_put_string(skb, TIPC_NLA_LINK_NAME, name)) return -EMSGSIZE; nla_nest_end(skb, link); return 0; } static int tipc_nl_compat_name_table_dump_header(struct tipc_nl_compat_msg *msg) { int i; u32 depth; struct tipc_name_table_query *ntq; static const char * const header[] = { "Type ", "Lower Upper ", "Port Identity ", "Publication Scope" }; ntq = (struct tipc_name_table_query *)TLV_DATA(msg->req); if (TLV_GET_DATA_LEN(msg->req) < (int)sizeof(struct tipc_name_table_query)) return -EINVAL; depth = ntohl(ntq->depth); if (depth > 4) depth = 4; for (i = 0; i < depth; i++) tipc_tlv_sprintf(msg->rep, header[i]); tipc_tlv_sprintf(msg->rep, "\n"); return 0; } static int tipc_nl_compat_name_table_dump(struct tipc_nl_compat_msg *msg, struct nlattr **attrs) { char port_str[27]; struct tipc_name_table_query *ntq; struct nlattr *nt[TIPC_NLA_NAME_TABLE_MAX + 1]; struct nlattr *publ[TIPC_NLA_PUBL_MAX + 1]; u32 node, depth, type, lowbound, upbound; static const char * const scope_str[] = {"", " zone", " cluster", " node"}; int err; if (!attrs[TIPC_NLA_NAME_TABLE]) return -EINVAL; err = nla_parse_nested_deprecated(nt, TIPC_NLA_NAME_TABLE_MAX, attrs[TIPC_NLA_NAME_TABLE], NULL, NULL); if (err) return err; if (!nt[TIPC_NLA_NAME_TABLE_PUBL]) return -EINVAL; err = nla_parse_nested_deprecated(publ, TIPC_NLA_PUBL_MAX, nt[TIPC_NLA_NAME_TABLE_PUBL], NULL, NULL); if (err) return err; ntq = (struct tipc_name_table_query *)TLV_DATA(msg->req); depth = ntohl(ntq->depth); type = ntohl(ntq->type); lowbound = ntohl(ntq->lowbound); upbound = ntohl(ntq->upbound); if (!(depth & TIPC_NTQ_ALLTYPES) && (type != nla_get_u32(publ[TIPC_NLA_PUBL_TYPE]))) return 0; if (lowbound && (lowbound > nla_get_u32(publ[TIPC_NLA_PUBL_UPPER]))) return 0; if (upbound && (upbound < nla_get_u32(publ[TIPC_NLA_PUBL_LOWER]))) return 0; tipc_tlv_sprintf(msg->rep, "%-10u ", nla_get_u32(publ[TIPC_NLA_PUBL_TYPE])); if (depth == 1) goto out; tipc_tlv_sprintf(msg->rep, "%-10u %-10u ", nla_get_u32(publ[TIPC_NLA_PUBL_LOWER]), nla_get_u32(publ[TIPC_NLA_PUBL_UPPER])); if (depth == 2) goto out; node = nla_get_u32(publ[TIPC_NLA_PUBL_NODE]); sprintf(port_str, "<%u.%u.%u:%u>", tipc_zone(node), tipc_cluster(node), tipc_node(node), nla_get_u32(publ[TIPC_NLA_PUBL_REF])); tipc_tlv_sprintf(msg->rep, "%-26s ", port_str); if (depth == 3) goto out; tipc_tlv_sprintf(msg->rep, "%-10u %s", nla_get_u32(publ[TIPC_NLA_PUBL_KEY]), scope_str[nla_get_u32(publ[TIPC_NLA_PUBL_SCOPE])]); out: tipc_tlv_sprintf(msg->rep, "\n"); return 0; } static int __tipc_nl_compat_publ_dump(struct tipc_nl_compat_msg *msg, struct nlattr **attrs) { u32 type, lower, upper; struct nlattr *publ[TIPC_NLA_PUBL_MAX + 1]; int err; if (!attrs[TIPC_NLA_PUBL]) return -EINVAL; err = nla_parse_nested_deprecated(publ, TIPC_NLA_PUBL_MAX, attrs[TIPC_NLA_PUBL], NULL, NULL); if (err) return err; type = nla_get_u32(publ[TIPC_NLA_PUBL_TYPE]); lower = nla_get_u32(publ[TIPC_NLA_PUBL_LOWER]); upper = nla_get_u32(publ[TIPC_NLA_PUBL_UPPER]); if (lower == upper) tipc_tlv_sprintf(msg->rep, " {%u,%u}", type, lower); else tipc_tlv_sprintf(msg->rep, " {%u,%u,%u}", type, lower, upper); return 0; } static int tipc_nl_compat_publ_dump(struct tipc_nl_compat_msg *msg, u32 sock) { int err; void *hdr; struct nlattr *nest; struct sk_buff *args; struct tipc_nl_compat_cmd_dump dump; args = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!args) return -ENOMEM; hdr = genlmsg_put(args, 0, 0, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_PUBL_GET); if (!hdr) { kfree_skb(args); return -EMSGSIZE; } nest = nla_nest_start_noflag(args, TIPC_NLA_SOCK); if (!nest) { kfree_skb(args); return -EMSGSIZE; } if (nla_put_u32(args, TIPC_NLA_SOCK_REF, sock)) { kfree_skb(args); return -EMSGSIZE; } nla_nest_end(args, nest); genlmsg_end(args, hdr); dump.dumpit = tipc_nl_publ_dump; dump.format = __tipc_nl_compat_publ_dump; err = __tipc_nl_compat_dumpit(&dump, msg, args); kfree_skb(args); return err; } static int tipc_nl_compat_sk_dump(struct tipc_nl_compat_msg *msg, struct nlattr **attrs) { int err; u32 sock_ref; struct nlattr *sock[TIPC_NLA_SOCK_MAX + 1]; if (!attrs[TIPC_NLA_SOCK]) return -EINVAL; err = nla_parse_nested_deprecated(sock, TIPC_NLA_SOCK_MAX, attrs[TIPC_NLA_SOCK], NULL, NULL); if (err) return err; sock_ref = nla_get_u32(sock[TIPC_NLA_SOCK_REF]); tipc_tlv_sprintf(msg->rep, "%u:", sock_ref); if (sock[TIPC_NLA_SOCK_CON]) { u32 node; struct nlattr *con[TIPC_NLA_CON_MAX + 1]; err = nla_parse_nested_deprecated(con, TIPC_NLA_CON_MAX, sock[TIPC_NLA_SOCK_CON], NULL, NULL); if (err) return err; node = nla_get_u32(con[TIPC_NLA_CON_NODE]); tipc_tlv_sprintf(msg->rep, " connected to <%u.%u.%u:%u>", tipc_zone(node), tipc_cluster(node), tipc_node(node), nla_get_u32(con[TIPC_NLA_CON_SOCK])); if (con[TIPC_NLA_CON_FLAG]) tipc_tlv_sprintf(msg->rep, " via {%u,%u}\n", nla_get_u32(con[TIPC_NLA_CON_TYPE]), nla_get_u32(con[TIPC_NLA_CON_INST])); else tipc_tlv_sprintf(msg->rep, "\n"); } else if (sock[TIPC_NLA_SOCK_HAS_PUBL]) { tipc_tlv_sprintf(msg->rep, " bound to"); err = tipc_nl_compat_publ_dump(msg, sock_ref); if (err) return err; } tipc_tlv_sprintf(msg->rep, "\n"); return 0; } static int tipc_nl_compat_media_dump(struct tipc_nl_compat_msg *msg, struct nlattr **attrs) { struct nlattr *media[TIPC_NLA_MEDIA_MAX + 1]; int err; if (!attrs[TIPC_NLA_MEDIA]) return -EINVAL; err = nla_parse_nested_deprecated(media, TIPC_NLA_MEDIA_MAX, attrs[TIPC_NLA_MEDIA], NULL, NULL); if (err) return err; return tipc_add_tlv(msg->rep, TIPC_TLV_MEDIA_NAME, nla_data(media[TIPC_NLA_MEDIA_NAME]), nla_len(media[TIPC_NLA_MEDIA_NAME])); } static int tipc_nl_compat_node_dump(struct tipc_nl_compat_msg *msg, struct nlattr **attrs) { struct tipc_node_info node_info; struct nlattr *node[TIPC_NLA_NODE_MAX + 1]; int err; if (!attrs[TIPC_NLA_NODE]) return -EINVAL; err = nla_parse_nested_deprecated(node, TIPC_NLA_NODE_MAX, attrs[TIPC_NLA_NODE], NULL, NULL); if (err) return err; node_info.addr = htonl(nla_get_u32(node[TIPC_NLA_NODE_ADDR])); node_info.up = htonl(nla_get_flag(node[TIPC_NLA_NODE_UP])); return tipc_add_tlv(msg->rep, TIPC_TLV_NODE_INFO, &node_info, sizeof(node_info)); } static int tipc_nl_compat_net_set(struct tipc_nl_compat_cmd_doit *cmd, struct sk_buff *skb, struct tipc_nl_compat_msg *msg) { u32 val; struct nlattr *net; val = ntohl(*(__be32 *)TLV_DATA(msg->req)); net = nla_nest_start_noflag(skb, TIPC_NLA_NET); if (!net) return -EMSGSIZE; if (msg->cmd == TIPC_CMD_SET_NODE_ADDR) { if (nla_put_u32(skb, TIPC_NLA_NET_ADDR, val)) return -EMSGSIZE; } else if (msg->cmd == TIPC_CMD_SET_NETID) { if (nla_put_u32(skb, TIPC_NLA_NET_ID, val)) return -EMSGSIZE; } nla_nest_end(skb, net); return 0; } static int tipc_nl_compat_net_dump(struct tipc_nl_compat_msg *msg, struct nlattr **attrs) { __be32 id; struct nlattr *net[TIPC_NLA_NET_MAX + 1]; int err; if (!attrs[TIPC_NLA_NET]) return -EINVAL; err = nla_parse_nested_deprecated(net, TIPC_NLA_NET_MAX, attrs[TIPC_NLA_NET], NULL, NULL); if (err) return err; id = htonl(nla_get_u32(net[TIPC_NLA_NET_ID])); return tipc_add_tlv(msg->rep, TIPC_TLV_UNSIGNED, &id, sizeof(id)); } static int tipc_cmd_show_stats_compat(struct tipc_nl_compat_msg *msg) { msg->rep = tipc_tlv_alloc(ULTRA_STRING_MAX_LEN); if (!msg->rep) return -ENOMEM; tipc_tlv_init(msg->rep, TIPC_TLV_ULTRA_STRING); tipc_tlv_sprintf(msg->rep, "TIPC version " TIPC_MOD_VER "\n"); return 0; } static int tipc_nl_compat_handle(struct tipc_nl_compat_msg *msg) { struct tipc_nl_compat_cmd_dump dump; struct tipc_nl_compat_cmd_doit doit; memset(&dump, 0, sizeof(dump)); memset(&doit, 0, sizeof(doit)); switch (msg->cmd) { case TIPC_CMD_NOOP: msg->rep = tipc_tlv_alloc(0); if (!msg->rep) return -ENOMEM; return 0; case TIPC_CMD_GET_BEARER_NAMES: msg->rep_size = MAX_BEARERS * TLV_SPACE(TIPC_MAX_BEARER_NAME); dump.dumpit = tipc_nl_bearer_dump; dump.format = tipc_nl_compat_bearer_dump; return tipc_nl_compat_dumpit(&dump, msg); case TIPC_CMD_ENABLE_BEARER: msg->req_type = TIPC_TLV_BEARER_CONFIG; doit.doit = __tipc_nl_bearer_enable; doit.transcode = tipc_nl_compat_bearer_enable; return tipc_nl_compat_doit(&doit, msg); case TIPC_CMD_DISABLE_BEARER: msg->req_type = TIPC_TLV_BEARER_NAME; doit.doit = __tipc_nl_bearer_disable; doit.transcode = tipc_nl_compat_bearer_disable; return tipc_nl_compat_doit(&doit, msg); case TIPC_CMD_SHOW_LINK_STATS: msg->req_type = TIPC_TLV_LINK_NAME; msg->rep_size = ULTRA_STRING_MAX_LEN; msg->rep_type = TIPC_TLV_ULTRA_STRING; dump.dumpit = tipc_nl_node_dump_link; dump.format = tipc_nl_compat_link_stat_dump; return tipc_nl_compat_dumpit(&dump, msg); case TIPC_CMD_GET_LINKS: msg->req_type = TIPC_TLV_NET_ADDR; msg->rep_size = ULTRA_STRING_MAX_LEN; dump.dumpit = tipc_nl_node_dump_link; dump.format = tipc_nl_compat_link_dump; return tipc_nl_compat_dumpit(&dump, msg); case TIPC_CMD_SET_LINK_TOL: case TIPC_CMD_SET_LINK_PRI: case TIPC_CMD_SET_LINK_WINDOW: msg->req_type = TIPC_TLV_LINK_CONFIG; doit.doit = tipc_nl_node_set_link; doit.transcode = tipc_nl_compat_link_set; return tipc_nl_compat_doit(&doit, msg); case TIPC_CMD_RESET_LINK_STATS: msg->req_type = TIPC_TLV_LINK_NAME; doit.doit = tipc_nl_node_reset_link_stats; doit.transcode = tipc_nl_compat_link_reset_stats; return tipc_nl_compat_doit(&doit, msg); case TIPC_CMD_SHOW_NAME_TABLE: msg->req_type = TIPC_TLV_NAME_TBL_QUERY; msg->rep_size = ULTRA_STRING_MAX_LEN; msg->rep_type = TIPC_TLV_ULTRA_STRING; dump.header = tipc_nl_compat_name_table_dump_header; dump.dumpit = tipc_nl_name_table_dump; dump.format = tipc_nl_compat_name_table_dump; return tipc_nl_compat_dumpit(&dump, msg); case TIPC_CMD_SHOW_PORTS: msg->rep_size = ULTRA_STRING_MAX_LEN; msg->rep_type = TIPC_TLV_ULTRA_STRING; dump.dumpit = tipc_nl_sk_dump; dump.format = tipc_nl_compat_sk_dump; return tipc_nl_compat_dumpit(&dump, msg); case TIPC_CMD_GET_MEDIA_NAMES: msg->rep_size = MAX_MEDIA * TLV_SPACE(TIPC_MAX_MEDIA_NAME); dump.dumpit = tipc_nl_media_dump; dump.format = tipc_nl_compat_media_dump; return tipc_nl_compat_dumpit(&dump, msg); case TIPC_CMD_GET_NODES: msg->rep_size = ULTRA_STRING_MAX_LEN; dump.dumpit = tipc_nl_node_dump; dump.format = tipc_nl_compat_node_dump; return tipc_nl_compat_dumpit(&dump, msg); case TIPC_CMD_SET_NODE_ADDR: msg->req_type = TIPC_TLV_NET_ADDR; doit.doit = __tipc_nl_net_set; doit.transcode = tipc_nl_compat_net_set; return tipc_nl_compat_doit(&doit, msg); case TIPC_CMD_SET_NETID: msg->req_type = TIPC_TLV_UNSIGNED; doit.doit = __tipc_nl_net_set; doit.transcode = tipc_nl_compat_net_set; return tipc_nl_compat_doit(&doit, msg); case TIPC_CMD_GET_NETID: msg->rep_size = sizeof(u32); dump.dumpit = tipc_nl_net_dump; dump.format = tipc_nl_compat_net_dump; return tipc_nl_compat_dumpit(&dump, msg); case TIPC_CMD_SHOW_STATS: return tipc_cmd_show_stats_compat(msg); } return -EOPNOTSUPP; } static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info) { int err; int len; struct tipc_nl_compat_msg msg; struct nlmsghdr *req_nlh; struct nlmsghdr *rep_nlh; struct tipc_genlmsghdr *req_userhdr = genl_info_userhdr(info); memset(&msg, 0, sizeof(msg)); req_nlh = (struct nlmsghdr *)skb->data; msg.req = nlmsg_data(req_nlh) + GENL_HDRLEN + TIPC_GENL_HDRLEN; msg.cmd = req_userhdr->cmd; msg.net = genl_info_net(info); msg.dst_sk = skb->sk; if ((msg.cmd & 0xC000) && (!netlink_net_capable(skb, CAP_NET_ADMIN))) { msg.rep = tipc_get_err_tlv(TIPC_CFG_NOT_NET_ADMIN); err = -EACCES; goto send; } msg.req_size = nlmsg_attrlen(req_nlh, GENL_HDRLEN + TIPC_GENL_HDRLEN); if (msg.req_size && !TLV_OK(msg.req, msg.req_size)) { msg.rep = tipc_get_err_tlv(TIPC_CFG_NOT_SUPPORTED); err = -EOPNOTSUPP; goto send; } err = tipc_nl_compat_handle(&msg); if ((err == -EOPNOTSUPP) || (err == -EPERM)) msg.rep = tipc_get_err_tlv(TIPC_CFG_NOT_SUPPORTED); else if (err == -EINVAL) msg.rep = tipc_get_err_tlv(TIPC_CFG_TLV_ERROR); send: if (!msg.rep) return err; len = nlmsg_total_size(GENL_HDRLEN + TIPC_GENL_HDRLEN); skb_push(msg.rep, len); rep_nlh = nlmsg_hdr(msg.rep); memcpy(rep_nlh, info->nlhdr, len); rep_nlh->nlmsg_len = msg.rep->len; genlmsg_unicast(msg.net, msg.rep, NETLINK_CB(skb).portid); return err; } static const struct genl_small_ops tipc_genl_compat_ops[] = { { .cmd = TIPC_GENL_CMD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = tipc_nl_compat_recv, }, }; static struct genl_family tipc_genl_compat_family __ro_after_init = { .name = TIPC_GENL_NAME, .version = TIPC_GENL_VERSION, .hdrsize = TIPC_GENL_HDRLEN, .maxattr = 0, .netnsok = true, .module = THIS_MODULE, .small_ops = tipc_genl_compat_ops, .n_small_ops = ARRAY_SIZE(tipc_genl_compat_ops), .resv_start_op = TIPC_GENL_CMD + 1, }; int __init tipc_netlink_compat_start(void) { int res; res = genl_register_family(&tipc_genl_compat_family); if (res) { pr_err("Failed to register legacy compat interface\n"); return res; } return 0; } void tipc_netlink_compat_stop(void) { genl_unregister_family(&tipc_genl_compat_family); }
3 3 3 3 3 3 3 3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 // SPDX-License-Identifier: GPL-2.0-only /* * linux/net/sunrpc/xdr.c * * Generic XDR support. * * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/pagemap.h> #include <linux/errno.h> #include <linux/sunrpc/xdr.h> #include <linux/sunrpc/msg_prot.h> #include <linux/bvec.h> #include <trace/events/sunrpc.h> static void _copy_to_pages(struct page **, size_t, const char *, size_t); /* * XDR functions for basic NFS types */ __be32 * xdr_encode_netobj(__be32 *p, const struct xdr_netobj *obj) { unsigned int quadlen = XDR_QUADLEN(obj->len); p[quadlen] = 0; /* zero trailing bytes */ *p++ = cpu_to_be32(obj->len); memcpy(p, obj->data, obj->len); return p + XDR_QUADLEN(obj->len); } EXPORT_SYMBOL_GPL(xdr_encode_netobj); /** * xdr_encode_opaque_fixed - Encode fixed length opaque data * @p: pointer to current position in XDR buffer. * @ptr: pointer to data to encode (or NULL) * @nbytes: size of data. * * Copy the array of data of length nbytes at ptr to the XDR buffer * at position p, then align to the next 32-bit boundary by padding * with zero bytes (see RFC1832). * Note: if ptr is NULL, only the padding is performed. * * Returns the updated current XDR buffer position * */ __be32 *xdr_encode_opaque_fixed(__be32 *p, const void *ptr, unsigned int nbytes) { if (likely(nbytes != 0)) { unsigned int quadlen = XDR_QUADLEN(nbytes); unsigned int padding = (quadlen << 2) - nbytes; if (ptr != NULL) memcpy(p, ptr, nbytes); if (padding != 0) memset((char *)p + nbytes, 0, padding); p += quadlen; } return p; } EXPORT_SYMBOL_GPL(xdr_encode_opaque_fixed); /** * xdr_encode_opaque - Encode variable length opaque data * @p: pointer to current position in XDR buffer. * @ptr: pointer to data to encode (or NULL) * @nbytes: size of data. * * Returns the updated current XDR buffer position */ __be32 *xdr_encode_opaque(__be32 *p, const void *ptr, unsigned int nbytes) { *p++ = cpu_to_be32(nbytes); return xdr_encode_opaque_fixed(p, ptr, nbytes); } EXPORT_SYMBOL_GPL(xdr_encode_opaque); __be32 * xdr_encode_string(__be32 *p, const char *string) { return xdr_encode_array(p, string, strlen(string)); } EXPORT_SYMBOL_GPL(xdr_encode_string); /** * xdr_terminate_string - '\0'-terminate a string residing in an xdr_buf * @buf: XDR buffer where string resides * @len: length of string, in bytes * */ void xdr_terminate_string(const struct xdr_buf *buf, const u32 len) { char *kaddr; kaddr = kmap_atomic(buf->pages[0]); kaddr[buf->page_base + len] = '\0'; kunmap_atomic(kaddr); } EXPORT_SYMBOL_GPL(xdr_terminate_string); size_t xdr_buf_pagecount(const struct xdr_buf *buf) { if (!buf->page_len) return 0; return (buf->page_base + buf->page_len + PAGE_SIZE - 1) >> PAGE_SHIFT; } int xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp) { size_t i, n = xdr_buf_pagecount(buf); if (n != 0 && buf->bvec == NULL) { buf->bvec = kmalloc_array(n, sizeof(buf->bvec[0]), gfp); if (!buf->bvec) return -ENOMEM; for (i = 0; i < n; i++) { bvec_set_page(&buf->bvec[i], buf->pages[i], PAGE_SIZE, 0); } } return 0; } void xdr_free_bvec(struct xdr_buf *buf) { kfree(buf->bvec); buf->bvec = NULL; } /** * xdr_buf_to_bvec - Copy components of an xdr_buf into a bio_vec array * @bvec: bio_vec array to populate * @bvec_size: element count of @bio_vec * @xdr: xdr_buf to be copied * * Returns the number of entries consumed in @bvec. */ unsigned int xdr_buf_to_bvec(struct bio_vec *bvec, unsigned int bvec_size, const struct xdr_buf *xdr) { const struct kvec *head = xdr->head; const struct kvec *tail = xdr->tail; unsigned int count = 0; if (head->iov_len) { bvec_set_virt(bvec++, head->iov_base, head->iov_len); ++count; } if (xdr->page_len) { unsigned int offset, len, remaining; struct page **pages = xdr->pages; offset = offset_in_page(xdr->page_base); remaining = xdr->page_len; while (remaining > 0) { len = min_t(unsigned int, remaining, PAGE_SIZE - offset); bvec_set_page(bvec++, *pages++, len, offset); remaining -= len; offset = 0; if (unlikely(++count > bvec_size)) goto bvec_overflow; } } if (tail->iov_len) { bvec_set_virt(bvec, tail->iov_base, tail->iov_len); if (unlikely(++count > bvec_size)) goto bvec_overflow; } return count; bvec_overflow: pr_warn_once("%s: bio_vec array overflow\n", __func__); return count - 1; } EXPORT_SYMBOL_GPL(xdr_buf_to_bvec); /** * xdr_inline_pages - Prepare receive buffer for a large reply * @xdr: xdr_buf into which reply will be placed * @offset: expected offset where data payload will start, in bytes * @pages: vector of struct page pointers * @base: offset in first page where receive should start, in bytes * @len: expected size of the upper layer data payload, in bytes * */ void xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset, struct page **pages, unsigned int base, unsigned int len) { struct kvec *head = xdr->head; struct kvec *tail = xdr->tail; char *buf = (char *)head->iov_base; unsigned int buflen = head->iov_len; head->iov_len = offset; xdr->pages = pages; xdr->page_base = base; xdr->page_len = len; tail->iov_base = buf + offset; tail->iov_len = buflen - offset; xdr->buflen += len; } EXPORT_SYMBOL_GPL(xdr_inline_pages); /* * Helper routines for doing 'memmove' like operations on a struct xdr_buf */ /** * _shift_data_left_pages * @pages: vector of pages containing both the source and dest memory area. * @pgto_base: page vector address of destination * @pgfrom_base: page vector address of source * @len: number of bytes to copy * * Note: the addresses pgto_base and pgfrom_base are both calculated in * the same way: * if a memory area starts at byte 'base' in page 'pages[i]', * then its address is given as (i << PAGE_CACHE_SHIFT) + base * Alse note: pgto_base must be < pgfrom_base, but the memory areas * they point to may overlap. */ static void _shift_data_left_pages(struct page **pages, size_t pgto_base, size_t pgfrom_base, size_t len) { struct page **pgfrom, **pgto; char *vfrom, *vto; size_t copy; BUG_ON(pgfrom_base <= pgto_base); if (!len) return; pgto = pages + (pgto_base >> PAGE_SHIFT); pgfrom = pages + (pgfrom_base >> PAGE_SHIFT); pgto_base &= ~PAGE_MASK; pgfrom_base &= ~PAGE_MASK; do { if (pgto_base >= PAGE_SIZE) { pgto_base = 0; pgto++; } if (pgfrom_base >= PAGE_SIZE){ pgfrom_base = 0; pgfrom++; } copy = len; if (copy > (PAGE_SIZE - pgto_base)) copy = PAGE_SIZE - pgto_base; if (copy > (PAGE_SIZE - pgfrom_base)) copy = PAGE_SIZE - pgfrom_base; vto = kmap_atomic(*pgto); if (*pgto != *pgfrom) { vfrom = kmap_atomic(*pgfrom); memcpy(vto + pgto_base, vfrom + pgfrom_base, copy); kunmap_atomic(vfrom); } else memmove(vto + pgto_base, vto + pgfrom_base, copy); flush_dcache_page(*pgto); kunmap_atomic(vto); pgto_base += copy; pgfrom_base += copy; } while ((len -= copy) != 0); } /** * _shift_data_right_pages * @pages: vector of pages containing both the source and dest memory area. * @pgto_base: page vector address of destination * @pgfrom_base: page vector address of source * @len: number of bytes to copy * * Note: the addresses pgto_base and pgfrom_base are both calculated in * the same way: * if a memory area starts at byte 'base' in page 'pages[i]', * then its address is given as (i << PAGE_SHIFT) + base * Also note: pgfrom_base must be < pgto_base, but the memory areas * they point to may overlap. */ static void _shift_data_right_pages(struct page **pages, size_t pgto_base, size_t pgfrom_base, size_t len) { struct page **pgfrom, **pgto; char *vfrom, *vto; size_t copy; BUG_ON(pgto_base <= pgfrom_base); if (!len) return; pgto_base += len; pgfrom_base += len; pgto = pages + (pgto_base >> PAGE_SHIFT); pgfrom = pages + (pgfrom_base >> PAGE_SHIFT); pgto_base &= ~PAGE_MASK; pgfrom_base &= ~PAGE_MASK; do { /* Are any pointers crossing a page boundary? */ if (pgto_base == 0) { pgto_base = PAGE_SIZE; pgto--; } if (pgfrom_base == 0) { pgfrom_base = PAGE_SIZE; pgfrom--; } copy = len; if (copy > pgto_base) copy = pgto_base; if (copy > pgfrom_base) copy = pgfrom_base; pgto_base -= copy; pgfrom_base -= copy; vto = kmap_atomic(*pgto); if (*pgto != *pgfrom) { vfrom = kmap_atomic(*pgfrom); memcpy(vto + pgto_base, vfrom + pgfrom_base, copy); kunmap_atomic(vfrom); } else memmove(vto + pgto_base, vto + pgfrom_base, copy); flush_dcache_page(*pgto); kunmap_atomic(vto); } while ((len -= copy) != 0); } /** * _copy_to_pages * @pages: array of pages * @pgbase: page vector address of destination * @p: pointer to source data * @len: length * * Copies data from an arbitrary memory location into an array of pages * The copy is assumed to be non-overlapping. */ static void _copy_to_pages(struct page **pages, size_t pgbase, const char *p, size_t len) { struct page **pgto; char *vto; size_t copy; if (!len) return; pgto = pages + (pgbase >> PAGE_SHIFT); pgbase &= ~PAGE_MASK; for (;;) { copy = PAGE_SIZE - pgbase; if (copy > len) copy = len; vto = kmap_atomic(*pgto); memcpy(vto + pgbase, p, copy); kunmap_atomic(vto); len -= copy; if (len == 0) break; pgbase += copy; if (pgbase == PAGE_SIZE) { flush_dcache_page(*pgto); pgbase = 0; pgto++; } p += copy; } flush_dcache_page(*pgto); } /** * _copy_from_pages * @p: pointer to destination * @pages: array of pages * @pgbase: offset of source data * @len: length * * Copies data into an arbitrary memory location from an array of pages * The copy is assumed to be non-overlapping. */ void _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len) { struct page **pgfrom; char *vfrom; size_t copy; if (!len) return; pgfrom = pages + (pgbase >> PAGE_SHIFT); pgbase &= ~PAGE_MASK; do { copy = PAGE_SIZE - pgbase; if (copy > len) copy = len; vfrom = kmap_atomic(*pgfrom); memcpy(p, vfrom + pgbase, copy); kunmap_atomic(vfrom); pgbase += copy; if (pgbase == PAGE_SIZE) { pgbase = 0; pgfrom++; } p += copy; } while ((len -= copy) != 0); } EXPORT_SYMBOL_GPL(_copy_from_pages); static void xdr_buf_iov_zero(const struct kvec *iov, unsigned int base, unsigned int len) { if (base >= iov->iov_len) return; if (len > iov->iov_len - base) len = iov->iov_len - base; memset(iov->iov_base + base, 0, len); } /** * xdr_buf_pages_zero * @buf: xdr_buf * @pgbase: beginning offset * @len: length */ static void xdr_buf_pages_zero(const struct xdr_buf *buf, unsigned int pgbase, unsigned int len) { struct page **pages = buf->pages; struct page **page; char *vpage; unsigned int zero; if (!len) return; if (pgbase >= buf->page_len) { xdr_buf_iov_zero(buf->tail, pgbase - buf->page_len, len); return; } if (pgbase + len > buf->page_len) { xdr_buf_iov_zero(buf->tail, 0, pgbase + len - buf->page_len); len = buf->page_len - pgbase; } pgbase += buf->page_base; page = pages + (pgbase >> PAGE_SHIFT); pgbase &= ~PAGE_MASK; do { zero = PAGE_SIZE - pgbase; if (zero > len) zero = len; vpage = kmap_atomic(*page); memset(vpage + pgbase, 0, zero); kunmap_atomic(vpage); flush_dcache_page(*page); pgbase = 0; page++; } while ((len -= zero) != 0); } static unsigned int xdr_buf_pages_fill_sparse(const struct xdr_buf *buf, unsigned int buflen, gfp_t gfp) { unsigned int i, npages, pagelen; if (!(buf->flags & XDRBUF_SPARSE_PAGES)) return buflen; if (buflen <= buf->head->iov_len) return buflen; pagelen = buflen - buf->head->iov_len; if (pagelen > buf->page_len) pagelen = buf->page_len; npages = (pagelen + buf->page_base + PAGE_SIZE - 1) >> PAGE_SHIFT; for (i = 0; i < npages; i++) { if (!buf->pages[i]) continue; buf->pages[i] = alloc_page(gfp); if (likely(buf->pages[i])) continue; buflen -= pagelen; pagelen = i << PAGE_SHIFT; if (pagelen > buf->page_base) buflen += pagelen - buf->page_base; break; } return buflen; } static void xdr_buf_try_expand(struct xdr_buf *buf, unsigned int len) { struct kvec *head = buf->head; struct kvec *tail = buf->tail; unsigned int sum = head->iov_len + buf->page_len + tail->iov_len; unsigned int free_space, newlen; if (sum > buf->len) { free_space = min_t(unsigned int, sum - buf->len, len); newlen = xdr_buf_pages_fill_sparse(buf, buf->len + free_space, GFP_KERNEL); free_space = newlen - buf->len; buf->len = newlen; len -= free_space; if (!len) return; } if (buf->buflen > sum) { /* Expand the tail buffer */ free_space = min_t(unsigned int, buf->buflen - sum, len); tail->iov_len += free_space; buf->len += free_space; } } static void xdr_buf_tail_copy_right(const struct xdr_buf *buf, unsigned int base, unsigned int len, unsigned int shift) { const struct kvec *tail = buf->tail; unsigned int to = base + shift; if (to >= tail->iov_len) return; if (len + to > tail->iov_len) len = tail->iov_len - to; memmove(tail->iov_base + to, tail->iov_base + base, len); } static void xdr_buf_pages_copy_right(const struct xdr_buf *buf, unsigned int base, unsigned int len, unsigned int shift) { const struct kvec *tail = buf->tail; unsigned int to = base + shift; unsigned int pglen = 0; unsigned int talen = 0, tato = 0; if (base >= buf->page_len) return; if (len > buf->page_len - base) len = buf->page_len - base; if (to >= buf->page_len) { tato = to - buf->page_len; if (tail->iov_len >= len + tato) talen = len; else if (tail->iov_len > tato) talen = tail->iov_len - tato; } else if (len + to >= buf->page_len) { pglen = buf->page_len - to; talen = len - pglen; if (talen > tail->iov_len) talen = tail->iov_len; } else pglen = len; _copy_from_pages(tail->iov_base + tato, buf->pages, buf->page_base + base + pglen, talen); _shift_data_right_pages(buf->pages, buf->page_base + to, buf->page_base + base, pglen); } static void xdr_buf_head_copy_right(const struct xdr_buf *buf, unsigned int base, unsigned int len, unsigned int shift) { const struct kvec *head = buf->head; const struct kvec *tail = buf->tail; unsigned int to = base + shift; unsigned int pglen = 0, pgto = 0; unsigned int talen = 0, tato = 0; if (base >= head->iov_len) return; if (len > head->iov_len - base) len = head->iov_len - base; if (to >= buf->page_len + head->iov_len) { tato = to - buf->page_len - head->iov_len; talen = len; } else if (to >= head->iov_len) { pgto = to - head->iov_len; pglen = len; if (pgto + pglen > buf->page_len) { talen = pgto + pglen - buf->page_len; pglen -= talen; } } else { pglen = len - to; if (pglen > buf->page_len) { talen = pglen - buf->page_len; pglen = buf->page_len; } } len -= talen; base += len; if (talen + tato > tail->iov_len) talen = tail->iov_len > tato ? tail->iov_len - tato : 0; memcpy(tail->iov_base + tato, head->iov_base + base, talen); len -= pglen; base -= pglen; _copy_to_pages(buf->pages, buf->page_base + pgto, head->iov_base + base, pglen); base -= len; memmove(head->iov_base + to, head->iov_base + base, len); } static void xdr_buf_tail_shift_right(const struct xdr_buf *buf, unsigned int base, unsigned int len, unsigned int shift) { const struct kvec *tail = buf->tail; if (base >= tail->iov_len || !shift || !len) return; xdr_buf_tail_copy_right(buf, base, len, shift); } static void xdr_buf_pages_shift_right(const struct xdr_buf *buf, unsigned int base, unsigned int len, unsigned int shift) { if (!shift || !len) return; if (base >= buf->page_len) { xdr_buf_tail_shift_right(buf, base - buf->page_len, len, shift); return; } if (base + len > buf->page_len) xdr_buf_tail_shift_right(buf, 0, base + len - buf->page_len, shift); xdr_buf_pages_copy_right(buf, base, len, shift); } static void xdr_buf_head_shift_right(const struct xdr_buf *buf, unsigned int base, unsigned int len, unsigned int shift) { const struct kvec *head = buf->head; if (!shift) return; if (base >= head->iov_len) { xdr_buf_pages_shift_right(buf, head->iov_len - base, len, shift); return; } if (base + len > head->iov_len) xdr_buf_pages_shift_right(buf, 0, base + len - head->iov_len, shift); xdr_buf_head_copy_right(buf, base, len, shift); } static void xdr_buf_tail_copy_left(const struct xdr_buf *buf, unsigned int base, unsigned int len, unsigned int shift) { const struct kvec *tail = buf->tail; if (base >= tail->iov_len) return; if (len > tail->iov_len - base) len = tail->iov_len - base; /* Shift data into head */ if (shift > buf->page_len + base) { const struct kvec *head = buf->head; unsigned int hdto = head->iov_len + buf->page_len + base - shift; unsigned int hdlen = len; if (WARN_ONCE(shift > head->iov_len + buf->page_len + base, "SUNRPC: Misaligned data.\n")) return; if (hdto + hdlen > head->iov_len) hdlen = head->iov_len - hdto; memcpy(head->iov_base + hdto, tail->iov_base + base, hdlen); base += hdlen; len -= hdlen; if (!len) return; } /* Shift data into pages */ if (shift > base) { unsigned int pgto = buf->page_len + base - shift; unsigned int pglen = len; if (pgto + pglen > buf->page_len) pglen = buf->page_len - pgto; _copy_to_pages(buf->pages, buf->page_base + pgto, tail->iov_base + base, pglen); base += pglen; len -= pglen; if (!len) return; } memmove(tail->iov_base + base - shift, tail->iov_base + base, len); } static void xdr_buf_pages_copy_left(const struct xdr_buf *buf, unsigned int base, unsigned int len, unsigned int shift) { unsigned int pgto; if (base >= buf->page_len) return; if (len > buf->page_len - base) len = buf->page_len - base; /* Shift data into head */ if (shift > base) { const struct kvec *head = buf->head; unsigned int hdto = head->iov_len + base - shift; unsigned int hdlen = len; if (WARN_ONCE(shift > head->iov_len + base, "SUNRPC: Misaligned data.\n")) return; if (hdto + hdlen > head->iov_len) hdlen = head->iov_len - hdto; _copy_from_pages(head->iov_base + hdto, buf->pages, buf->page_base + base, hdlen); base += hdlen; len -= hdlen; if (!len) return; } pgto = base - shift; _shift_data_left_pages(buf->pages, buf->page_base + pgto, buf->page_base + base, len); } static void xdr_buf_tail_shift_left(const struct xdr_buf *buf, unsigned int base, unsigned int len, unsigned int shift) { if (!shift || !len) return; xdr_buf_tail_copy_left(buf, base, len, shift); } static void xdr_buf_pages_shift_left(const struct xdr_buf *buf, unsigned int base, unsigned int len, unsigned int shift) { if (!shift || !len) return; if (base >= buf->page_len) { xdr_buf_tail_shift_left(buf, base - buf->page_len, len, shift); return; } xdr_buf_pages_copy_left(buf, base, len, shift); len += base; if (len <= buf->page_len) return; xdr_buf_tail_copy_left(buf, 0, len - buf->page_len, shift); } static void xdr_buf_head_shift_left(const struct xdr_buf *buf, unsigned int base, unsigned int len, unsigned int shift) { const struct kvec *head = buf->head; unsigned int bytes; if (!shift || !len) return; if (shift > base) { bytes = (shift - base); if (bytes >= len) return; base += bytes; len -= bytes; } if (base < head->iov_len) { bytes = min_t(unsigned int, len, head->iov_len - base); memmove(head->iov_base + (base - shift), head->iov_base + base, bytes); base += bytes; len -= bytes; } xdr_buf_pages_shift_left(buf, base - head->iov_len, len, shift); } /** * xdr_shrink_bufhead * @buf: xdr_buf * @len: new length of buf->head[0] * * Shrinks XDR buffer's header kvec buf->head[0], setting it to * 'len' bytes. The extra data is not lost, but is instead * moved into the inlined pages and/or the tail. */ static unsigned int xdr_shrink_bufhead(struct xdr_buf *buf, unsigned int len) { struct kvec *head = buf->head; unsigned int shift, buflen = max(buf->len, len); WARN_ON_ONCE(len > head->iov_len); if (head->iov_len > buflen) { buf->buflen -= head->iov_len - buflen; head->iov_len = buflen; } if (len >= head->iov_len) return 0; shift = head->iov_len - len; xdr_buf_try_expand(buf, shift); xdr_buf_head_shift_right(buf, len, buflen - len, shift); head->iov_len = len; buf->buflen -= shift; buf->len -= shift; return shift; } /** * xdr_shrink_pagelen - shrinks buf->pages to @len bytes * @buf: xdr_buf * @len: new page buffer length * * The extra data is not lost, but is instead moved into buf->tail. * Returns the actual number of bytes moved. */ static unsigned int xdr_shrink_pagelen(struct xdr_buf *buf, unsigned int len) { unsigned int shift, buflen = buf->len - buf->head->iov_len; WARN_ON_ONCE(len > buf->page_len); if (buf->head->iov_len >= buf->len || len > buflen) buflen = len; if (buf->page_len > buflen) { buf->buflen -= buf->page_len - buflen; buf->page_len = buflen; } if (len >= buf->page_len) return 0; shift = buf->page_len - len; xdr_buf_try_expand(buf, shift); xdr_buf_pages_shift_right(buf, len, buflen - len, shift); buf->page_len = len; buf->len -= shift; buf->buflen -= shift; return shift; } /** * xdr_stream_pos - Return the current offset from the start of the xdr_stream * @xdr: pointer to struct xdr_stream */ unsigned int xdr_stream_pos(const struct xdr_stream *xdr) { return (unsigned int)(XDR_QUADLEN(xdr->buf->len) - xdr->nwords) << 2; } EXPORT_SYMBOL_GPL(xdr_stream_pos); static void xdr_stream_set_pos(struct xdr_stream *xdr, unsigned int pos) { unsigned int blen = xdr->buf->len; xdr->nwords = blen > pos ? XDR_QUADLEN(blen) - XDR_QUADLEN(pos) : 0; } static void xdr_stream_page_set_pos(struct xdr_stream *xdr, unsigned int pos) { xdr_stream_set_pos(xdr, pos + xdr->buf->head[0].iov_len); } /** * xdr_page_pos - Return the current offset from the start of the xdr pages * @xdr: pointer to struct xdr_stream */ unsigned int xdr_page_pos(const struct xdr_stream *xdr) { unsigned int pos = xdr_stream_pos(xdr); WARN_ON(pos < xdr->buf->head[0].iov_len); return pos - xdr->buf->head[0].iov_len; } EXPORT_SYMBOL_GPL(xdr_page_pos); /** * xdr_init_encode - Initialize a struct xdr_stream for sending data. * @xdr: pointer to xdr_stream struct * @buf: pointer to XDR buffer in which to encode data * @p: current pointer inside XDR buffer * @rqst: pointer to controlling rpc_rqst, for debugging * * Note: at the moment the RPC client only passes the length of our * scratch buffer in the xdr_buf's header kvec. Previously this * meant we needed to call xdr_adjust_iovec() after encoding the * data. With the new scheme, the xdr_stream manages the details * of the buffer length, and takes care of adjusting the kvec * length for us. */ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, struct rpc_rqst *rqst) { struct kvec *iov = buf->head; int scratch_len = buf->buflen - buf->page_len - buf->tail[0].iov_len; xdr_reset_scratch_buffer(xdr); BUG_ON(scratch_len < 0); xdr->buf = buf; xdr->iov = iov; xdr->p = (__be32 *)((char *)iov->iov_base + iov->iov_len); xdr->end = (__be32 *)((char *)iov->iov_base + scratch_len); BUG_ON(iov->iov_len > scratch_len); if (p != xdr->p && p != NULL) { size_t len; BUG_ON(p < xdr->p || p > xdr->end); len = (char *)p - (char *)xdr->p; xdr->p = p; buf->len += len; iov->iov_len += len; } xdr->rqst = rqst; } EXPORT_SYMBOL_GPL(xdr_init_encode); /** * xdr_init_encode_pages - Initialize an xdr_stream for encoding into pages * @xdr: pointer to xdr_stream struct * @buf: pointer to XDR buffer into which to encode data * */ void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf) { xdr_reset_scratch_buffer(xdr); xdr->buf = buf; xdr->page_ptr = buf->pages; xdr->iov = NULL; xdr->p = page_address(*xdr->page_ptr); xdr->end = (void *)xdr->p + min_t(u32, buf->buflen, PAGE_SIZE); xdr->rqst = NULL; } EXPORT_SYMBOL_GPL(xdr_init_encode_pages); /** * __xdr_commit_encode - Ensure all data is written to buffer * @xdr: pointer to xdr_stream * * We handle encoding across page boundaries by giving the caller a * temporary location to write to, then later copying the data into * place; xdr_commit_encode does that copying. * * Normally the caller doesn't need to call this directly, as the * following xdr_reserve_space will do it. But an explicit call may be * required at the end of encoding, or any other time when the xdr_buf * data might be read. */ void __xdr_commit_encode(struct xdr_stream *xdr) { size_t shift = xdr->scratch.iov_len; void *page; page = page_address(*xdr->page_ptr); memcpy(xdr->scratch.iov_base, page, shift); memmove(page, page + shift, (void *)xdr->p - page); xdr_reset_scratch_buffer(xdr); } EXPORT_SYMBOL_GPL(__xdr_commit_encode); /* * The buffer space to be reserved crosses the boundary between * xdr->buf->head and xdr->buf->pages, or between two pages * in xdr->buf->pages. */ static noinline __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr, size_t nbytes) { int space_left; int frag1bytes, frag2bytes; void *p; if (nbytes > PAGE_SIZE) goto out_overflow; /* Bigger buffers require special handling */ if (xdr->buf->len + nbytes > xdr->buf->buflen) goto out_overflow; /* Sorry, we're totally out of space */ frag1bytes = (xdr->end - xdr->p) << 2; frag2bytes = nbytes - frag1bytes; if (xdr->iov) xdr->iov->iov_len += frag1bytes; else xdr->buf->page_len += frag1bytes; xdr->page_ptr++; xdr->iov = NULL; /* * If the last encode didn't end exactly on a page boundary, the * next one will straddle boundaries. Encode into the next * page, then copy it back later in xdr_commit_encode. We use * the "scratch" iov to track any temporarily unused fragment of * space at the end of the previous buffer: */ xdr_set_scratch_buffer(xdr, xdr->p, frag1bytes); /* * xdr->p is where the next encode will start after * xdr_commit_encode() has shifted this one back: */ p = page_address(*xdr->page_ptr); xdr->p = p + frag2bytes; space_left = xdr->buf->buflen - xdr->buf->len; if (space_left - frag1bytes >= PAGE_SIZE) xdr->end = p + PAGE_SIZE; else xdr->end = p + space_left - frag1bytes; xdr->buf->page_len += frag2bytes; xdr->buf->len += nbytes; return p; out_overflow: trace_rpc_xdr_overflow(xdr, nbytes); return NULL; } /** * xdr_reserve_space - Reserve buffer space for sending * @xdr: pointer to xdr_stream * @nbytes: number of bytes to reserve * * Checks that we have enough buffer space to encode 'nbytes' more * bytes of data. If so, update the total xdr_buf length, and * adjust the length of the current kvec. * * The returned pointer is valid only until the next call to * xdr_reserve_space() or xdr_commit_encode() on @xdr. The current * implementation of this API guarantees that space reserved for a * four-byte data item remains valid until @xdr is destroyed, but * that might not always be true in the future. */ __be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes) { __be32 *p = xdr->p; __be32 *q; xdr_commit_encode(xdr); /* align nbytes on the next 32-bit boundary */ nbytes += 3; nbytes &= ~3; q = p + (nbytes >> 2); if (unlikely(q > xdr->end || q < p)) return xdr_get_next_encode_buffer(xdr, nbytes); xdr->p = q; if (xdr->iov) xdr->iov->iov_len += nbytes; else xdr->buf->page_len += nbytes; xdr->buf->len += nbytes; return p; } EXPORT_SYMBOL_GPL(xdr_reserve_space); /** * xdr_reserve_space_vec - Reserves a large amount of buffer space for sending * @xdr: pointer to xdr_stream * @nbytes: number of bytes to reserve * * The size argument passed to xdr_reserve_space() is determined based * on the number of bytes remaining in the current page to avoid * invalidating iov_base pointers when xdr_commit_encode() is called. * * Return values: * %0: success * %-EMSGSIZE: not enough space is available in @xdr */ int xdr_reserve_space_vec(struct xdr_stream *xdr, size_t nbytes) { size_t thislen; __be32 *p; /* * svcrdma requires every READ payload to start somewhere * in xdr->pages. */ if (xdr->iov == xdr->buf->head) { xdr->iov = NULL; xdr->end = xdr->p; } /* XXX: Let's find a way to make this more efficient */ while (nbytes) { thislen = xdr->buf->page_len % PAGE_SIZE; thislen = min_t(size_t, nbytes, PAGE_SIZE - thislen); p = xdr_reserve_space(xdr, thislen); if (!p) return -EMSGSIZE; nbytes -= thislen; } return 0; } EXPORT_SYMBOL_GPL(xdr_reserve_space_vec); /** * xdr_truncate_encode - truncate an encode buffer * @xdr: pointer to xdr_stream * @len: new length of buffer * * Truncates the xdr stream, so that xdr->buf->len == len, * and xdr->p points at offset len from the start of the buffer, and * head, tail, and page lengths are adjusted to correspond. * * If this means moving xdr->p to a different buffer, we assume that * the end pointer should be set to the end of the current page, * except in the case of the head buffer when we assume the head * buffer's current length represents the end of the available buffer. * * This is *not* safe to use on a buffer that already has inlined page * cache pages (as in a zero-copy server read reply), except for the * simple case of truncating from one position in the tail to another. * */ void xdr_truncate_encode(struct xdr_stream *xdr, size_t len) { struct xdr_buf *buf = xdr->buf; struct kvec *head = buf->head; struct kvec *tail = buf->tail; int fraglen; int new; if (len > buf->len) { WARN_ON_ONCE(1); return; } xdr_commit_encode(xdr); fraglen = min_t(int, buf->len - len, tail->iov_len); tail->iov_len -= fraglen; buf->len -= fraglen; if (tail->iov_len) { xdr->p = tail->iov_base + tail->iov_len; WARN_ON_ONCE(!xdr->end); WARN_ON_ONCE(!xdr->iov); return; } WARN_ON_ONCE(fraglen); fraglen = min_t(int, buf->len - len, buf->page_len); buf->page_len -= fraglen; buf->len -= fraglen; new = buf->page_base + buf->page_len; xdr->page_ptr = buf->pages + (new >> PAGE_SHIFT); if (buf->page_len) { xdr->p = page_address(*xdr->page_ptr); xdr->end = (void *)xdr->p + PAGE_SIZE; xdr->p = (void *)xdr->p + (new % PAGE_SIZE); WARN_ON_ONCE(xdr->iov); return; } if (fraglen) xdr->end = head->iov_base + head->iov_len; /* (otherwise assume xdr->end is already set) */ xdr->page_ptr--; head->iov_len = len; buf->len = len; xdr->p = head->iov_base + head->iov_len; xdr->iov = buf->head; } EXPORT_SYMBOL(xdr_truncate_encode); /** * xdr_truncate_decode - Truncate a decoding stream * @xdr: pointer to struct xdr_stream * @len: Number of bytes to remove * */ void xdr_truncate_decode(struct xdr_stream *xdr, size_t len) { unsigned int nbytes = xdr_align_size(len); xdr->buf->len -= nbytes; xdr->nwords -= XDR_QUADLEN(nbytes); } EXPORT_SYMBOL_GPL(xdr_truncate_decode); /** * xdr_restrict_buflen - decrease available buffer space * @xdr: pointer to xdr_stream * @newbuflen: new maximum number of bytes available * * Adjust our idea of how much space is available in the buffer. * If we've already used too much space in the buffer, returns -1. * If the available space is already smaller than newbuflen, returns 0 * and does nothing. Otherwise, adjusts xdr->buf->buflen to newbuflen * and ensures xdr->end is set at most offset newbuflen from the start * of the buffer. */ int xdr_restrict_buflen(struct xdr_stream *xdr, int newbuflen) { struct xdr_buf *buf = xdr->buf; int left_in_this_buf = (void *)xdr->end - (void *)xdr->p; int end_offset = buf->len + left_in_this_buf; if (newbuflen < 0 || newbuflen < buf->len) return -1; if (newbuflen > buf->buflen) return 0; if (newbuflen < end_offset) xdr->end = (void *)xdr->end + newbuflen - end_offset; buf->buflen = newbuflen; return 0; } EXPORT_SYMBOL(xdr_restrict_buflen); /** * xdr_write_pages - Insert a list of pages into an XDR buffer for sending * @xdr: pointer to xdr_stream * @pages: array of pages to insert * @base: starting offset of first data byte in @pages * @len: number of data bytes in @pages to insert * * After the @pages are added, the tail iovec is instantiated pointing to * end of the head buffer, and the stream is set up to encode subsequent * items into the tail. */ void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int base, unsigned int len) { struct xdr_buf *buf = xdr->buf; struct kvec *tail = buf->tail; buf->pages = pages; buf->page_base = base; buf->page_len = len; tail->iov_base = xdr->p; tail->iov_len = 0; xdr->iov = tail; if (len & 3) { unsigned int pad = 4 - (len & 3); BUG_ON(xdr->p >= xdr->end); tail->iov_base = (char *)xdr->p + (len & 3); tail->iov_len += pad; len += pad; *xdr->p++ = 0; } buf->buflen += len; buf->len += len; } EXPORT_SYMBOL_GPL(xdr_write_pages); static unsigned int xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov, unsigned int base, unsigned int len) { if (len > iov->iov_len) len = iov->iov_len; if (unlikely(base > len)) base = len; xdr->p = (__be32*)(iov->iov_base + base); xdr->end = (__be32*)(iov->iov_base + len); xdr->iov = iov; xdr->page_ptr = NULL; return len - base; } static unsigned int xdr_set_tail_base(struct xdr_stream *xdr, unsigned int base, unsigned int len) { struct xdr_buf *buf = xdr->buf; xdr_stream_set_pos(xdr, base + buf->page_len + buf->head->iov_len); return xdr_set_iov(xdr, buf->tail, base, len); } static void xdr_stream_unmap_current_page(struct xdr_stream *xdr) { if (xdr->page_kaddr) { kunmap_local(xdr->page_kaddr); xdr->page_kaddr = NULL; } } static unsigned int xdr_set_page_base(struct xdr_stream *xdr, unsigned int base, unsigned int len) { unsigned int pgnr; unsigned int maxlen; unsigned int pgoff; unsigned int pgend; void *kaddr; maxlen = xdr->buf->page_len; if (base >= maxlen) return 0; else maxlen -= base; if (len > maxlen) len = maxlen; xdr_stream_unmap_current_page(xdr); xdr_stream_page_set_pos(xdr, base); base += xdr->buf->page_base; pgnr = base >> PAGE_SHIFT; xdr->page_ptr = &xdr->buf->pages[pgnr]; if (PageHighMem(*xdr->page_ptr)) { xdr->page_kaddr = kmap_local_page(*xdr->page_ptr); kaddr = xdr->page_kaddr; } else kaddr = page_address(*xdr->page_ptr); pgoff = base & ~PAGE_MASK; xdr->p = (__be32*)(kaddr + pgoff); pgend = pgoff + len; if (pgend > PAGE_SIZE) pgend = PAGE_SIZE; xdr->end = (__be32*)(kaddr + pgend); xdr->iov = NULL; return len; } static void xdr_set_page(struct xdr_stream *xdr, unsigned int base, unsigned int len) { if (xdr_set_page_base(xdr, base, len) == 0) { base -= xdr->buf->page_len; xdr_set_tail_base(xdr, base, len); } } static void xdr_set_next_page(struct xdr_stream *xdr) { unsigned int newbase; newbase = (1 + xdr->page_ptr - xdr->buf->pages) << PAGE_SHIFT; newbase -= xdr->buf->page_base; if (newbase < xdr->buf->page_len) xdr_set_page_base(xdr, newbase, xdr_stream_remaining(xdr)); else xdr_set_tail_base(xdr, 0, xdr_stream_remaining(xdr)); } static bool xdr_set_next_buffer(struct xdr_stream *xdr) { if (xdr->page_ptr != NULL) xdr_set_next_page(xdr); else if (xdr->iov == xdr->buf->head) xdr_set_page(xdr, 0, xdr_stream_remaining(xdr)); return xdr->p != xdr->end; } /** * xdr_init_decode - Initialize an xdr_stream for decoding data. * @xdr: pointer to xdr_stream struct * @buf: pointer to XDR buffer from which to decode data * @p: current pointer inside XDR buffer * @rqst: pointer to controlling rpc_rqst, for debugging */ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, struct rpc_rqst *rqst) { xdr->buf = buf; xdr->page_kaddr = NULL; xdr_reset_scratch_buffer(xdr); xdr->nwords = XDR_QUADLEN(buf->len); if (xdr_set_iov(xdr, buf->head, 0, buf->len) == 0 && xdr_set_page_base(xdr, 0, buf->len) == 0) xdr_set_iov(xdr, buf->tail, 0, buf->len); if (p != NULL && p > xdr->p && xdr->end >= p) { xdr->nwords -= p - xdr->p; xdr->p = p; } xdr->rqst = rqst; } EXPORT_SYMBOL_GPL(xdr_init_decode); /** * xdr_init_decode_pages - Initialize an xdr_stream for decoding into pages * @xdr: pointer to xdr_stream struct * @buf: pointer to XDR buffer from which to decode data * @pages: list of pages to decode into * @len: length in bytes of buffer in pages */ void xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf, struct page **pages, unsigned int len) { memset(buf, 0, sizeof(*buf)); buf->pages = pages; buf->page_len = len; buf->buflen = len; buf->len = len; xdr_init_decode(xdr, buf, NULL, NULL); } EXPORT_SYMBOL_GPL(xdr_init_decode_pages); /** * xdr_finish_decode - Clean up the xdr_stream after decoding data. * @xdr: pointer to xdr_stream struct */ void xdr_finish_decode(struct xdr_stream *xdr) { xdr_stream_unmap_current_page(xdr); } EXPORT_SYMBOL(xdr_finish_decode); static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes) { unsigned int nwords = XDR_QUADLEN(nbytes); __be32 *p = xdr->p; __be32 *q = p + nwords; if (unlikely(nwords > xdr->nwords || q > xdr->end || q < p)) return NULL; xdr->p = q; xdr->nwords -= nwords; return p; } static __be32 *xdr_copy_to_scratch(struct xdr_stream *xdr, size_t nbytes) { __be32 *p; char *cpdest = xdr->scratch.iov_base; size_t cplen = (char *)xdr->end - (char *)xdr->p; if (nbytes > xdr->scratch.iov_len) goto out_overflow; p = __xdr_inline_decode(xdr, cplen); if (p == NULL) return NULL; memcpy(cpdest, p, cplen); if (!xdr_set_next_buffer(xdr)) goto out_overflow; cpdest += cplen; nbytes -= cplen; p = __xdr_inline_decode(xdr, nbytes); if (p == NULL) return NULL; memcpy(cpdest, p, nbytes); return xdr->scratch.iov_base; out_overflow: trace_rpc_xdr_overflow(xdr, nbytes); return NULL; } /** * xdr_inline_decode - Retrieve XDR data to decode * @xdr: pointer to xdr_stream struct * @nbytes: number of bytes of data to decode * * Check if the input buffer is long enough to enable us to decode * 'nbytes' more bytes of data starting at the current position. * If so return the current pointer, then update the current * pointer position. */ __be32 * xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes) { __be32 *p; if (unlikely(nbytes == 0)) return xdr->p; if (xdr->p == xdr->end && !xdr_set_next_buffer(xdr)) goto out_overflow; p = __xdr_inline_decode(xdr, nbytes); if (p != NULL) return p; return xdr_copy_to_scratch(xdr, nbytes); out_overflow: trace_rpc_xdr_overflow(xdr, nbytes); return NULL; } EXPORT_SYMBOL_GPL(xdr_inline_decode); static void xdr_realign_pages(struct xdr_stream *xdr) { struct xdr_buf *buf = xdr->buf; struct kvec *iov = buf->head; unsigned int cur = xdr_stream_pos(xdr); unsigned int copied; /* Realign pages to current pointer position */ if (iov->iov_len > cur) { copied = xdr_shrink_bufhead(buf, cur); trace_rpc_xdr_alignment(xdr, cur, copied); xdr_set_page(xdr, 0, buf->page_len); } } static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) { struct xdr_buf *buf = xdr->buf; unsigned int nwords = XDR_QUADLEN(len); unsigned int copied; if (xdr->nwords == 0) return 0; xdr_realign_pages(xdr); if (nwords > xdr->nwords) { nwords = xdr->nwords; len = nwords << 2; } if (buf->page_len <= len) len = buf->page_len; else if (nwords < xdr->nwords) { /* Truncate page data and move it into the tail */ copied = xdr_shrink_pagelen(buf, len); trace_rpc_xdr_alignment(xdr, len, copied); } return len; } /** * xdr_read_pages - align page-based XDR data to current pointer position * @xdr: pointer to xdr_stream struct * @len: number of bytes of page data * * Moves data beyond the current pointer position from the XDR head[] buffer * into the page list. Any data that lies beyond current position + @len * bytes is moved into the XDR tail[]. The xdr_stream current position is * then advanced past that data to align to the next XDR object in the tail. * * Returns the number of XDR encoded bytes now contained in the pages */ unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len) { unsigned int nwords = XDR_QUADLEN(len); unsigned int base, end, pglen; pglen = xdr_align_pages(xdr, nwords << 2); if (pglen == 0) return 0; base = (nwords << 2) - pglen; end = xdr_stream_remaining(xdr) - pglen; xdr_set_tail_base(xdr, base, end); return len <= pglen ? len : pglen; } EXPORT_SYMBOL_GPL(xdr_read_pages); /** * xdr_set_pagelen - Sets the length of the XDR pages * @xdr: pointer to xdr_stream struct * @len: new length of the XDR page data * * Either grows or shrinks the length of the xdr pages by setting pagelen to * @len bytes. When shrinking, any extra data is moved into buf->tail, whereas * when growing any data beyond the current pointer is moved into the tail. * * Returns True if the operation was successful, and False otherwise. */ void xdr_set_pagelen(struct xdr_stream *xdr, unsigned int len) { struct xdr_buf *buf = xdr->buf; size_t remaining = xdr_stream_remaining(xdr); size_t base = 0; if (len < buf->page_len) { base = buf->page_len - len; xdr_shrink_pagelen(buf, len); } else { xdr_buf_head_shift_right(buf, xdr_stream_pos(xdr), buf->page_len, remaining); if (len > buf->page_len) xdr_buf_try_expand(buf, len - buf->page_len); } xdr_set_tail_base(xdr, base, remaining); } EXPORT_SYMBOL_GPL(xdr_set_pagelen); /** * xdr_enter_page - decode data from the XDR page * @xdr: pointer to xdr_stream struct * @len: number of bytes of page data * * Moves data beyond the current pointer position from the XDR head[] buffer * into the page list. Any data that lies beyond current position + "len" * bytes is moved into the XDR tail[]. The current pointer is then * repositioned at the beginning of the first XDR page. */ void xdr_enter_page(struct xdr_stream *xdr, unsigned int len) { len = xdr_align_pages(xdr, len); /* * Position current pointer at beginning of tail, and * set remaining message length. */ if (len != 0) xdr_set_page_base(xdr, 0, len); } EXPORT_SYMBOL_GPL(xdr_enter_page); static const struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0}; void xdr_buf_from_iov(const struct kvec *iov, struct xdr_buf *buf) { buf->head[0] = *iov; buf->tail[0] = empty_iov; buf->page_len = 0; buf->buflen = buf->len = iov->iov_len; } EXPORT_SYMBOL_GPL(xdr_buf_from_iov); /** * xdr_buf_subsegment - set subbuf to a portion of buf * @buf: an xdr buffer * @subbuf: the result buffer * @base: beginning of range in bytes * @len: length of range in bytes * * sets @subbuf to an xdr buffer representing the portion of @buf of * length @len starting at offset @base. * * @buf and @subbuf may be pointers to the same struct xdr_buf. * * Returns -1 if base or length are out of bounds. */ int xdr_buf_subsegment(const struct xdr_buf *buf, struct xdr_buf *subbuf, unsigned int base, unsigned int len) { subbuf->buflen = subbuf->len = len; if (base < buf->head[0].iov_len) { subbuf->head[0].iov_base = buf->head[0].iov_base + base; subbuf->head[0].iov_len = min_t(unsigned int, len, buf->head[0].iov_len - base); len -= subbuf->head[0].iov_len; base = 0; } else { base -= buf->head[0].iov_len; subbuf->head[0].iov_base = buf->head[0].iov_base; subbuf->head[0].iov_len = 0; } if (base < buf->page_len) { subbuf->page_len = min(buf->page_len - base, len); base += buf->page_base; subbuf->page_base = base & ~PAGE_MASK; subbuf->pages = &buf->pages[base >> PAGE_SHIFT]; len -= subbuf->page_len; base = 0; } else { base -= buf->page_len; subbuf->pages = buf->pages; subbuf->page_base = 0; subbuf->page_len = 0; } if (base < buf->tail[0].iov_len) { subbuf->tail[0].iov_base = buf->tail[0].iov_base + base; subbuf->tail[0].iov_len = min_t(unsigned int, len, buf->tail[0].iov_len - base); len -= subbuf->tail[0].iov_len; base = 0; } else { base -= buf->tail[0].iov_len; subbuf->tail[0].iov_base = buf->tail[0].iov_base; subbuf->tail[0].iov_len = 0; } if (base || len) return -1; return 0; } EXPORT_SYMBOL_GPL(xdr_buf_subsegment); /** * xdr_stream_subsegment - set @subbuf to a portion of @xdr * @xdr: an xdr_stream set up for decoding * @subbuf: the result buffer * @nbytes: length of @xdr to extract, in bytes * * Sets up @subbuf to represent a portion of @xdr. The portion * starts at the current offset in @xdr, and extends for a length * of @nbytes. If this is successful, @xdr is advanced to the next * XDR data item following that portion. * * Return values: * %true: @subbuf has been initialized, and @xdr has been advanced. * %false: a bounds error has occurred */ bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf, unsigned int nbytes) { unsigned int start = xdr_stream_pos(xdr); unsigned int remaining, len; /* Extract @subbuf and bounds-check the fn arguments */ if (xdr_buf_subsegment(xdr->buf, subbuf, start, nbytes)) return false; /* Advance @xdr by @nbytes */ for (remaining = nbytes; remaining;) { if (xdr->p == xdr->end && !xdr_set_next_buffer(xdr)) return false; len = (char *)xdr->end - (char *)xdr->p; if (remaining <= len) { xdr->p = (__be32 *)((char *)xdr->p + (remaining + xdr_pad_size(nbytes))); break; } xdr->p = (__be32 *)((char *)xdr->p + len); xdr->end = xdr->p; remaining -= len; } xdr_stream_set_pos(xdr, start + nbytes); return true; } EXPORT_SYMBOL_GPL(xdr_stream_subsegment); /** * xdr_stream_move_subsegment - Move part of a stream to another position * @xdr: the source xdr_stream * @offset: the source offset of the segment * @target: the target offset of the segment * @length: the number of bytes to move * * Moves @length bytes from @offset to @target in the xdr_stream, overwriting * anything in its space. Returns the number of bytes in the segment. */ unsigned int xdr_stream_move_subsegment(struct xdr_stream *xdr, unsigned int offset, unsigned int target, unsigned int length) { struct xdr_buf buf; unsigned int shift; if (offset < target) { shift = target - offset; if (xdr_buf_subsegment(xdr->buf, &buf, offset, shift + length) < 0) return 0; xdr_buf_head_shift_right(&buf, 0, length, shift); } else if (offset > target) { shift = offset - target; if (xdr_buf_subsegment(xdr->buf, &buf, target, shift + length) < 0) return 0; xdr_buf_head_shift_left(&buf, shift, length, shift); } return length; } EXPORT_SYMBOL_GPL(xdr_stream_move_subsegment); /** * xdr_stream_zero - zero out a portion of an xdr_stream * @xdr: an xdr_stream to zero out * @offset: the starting point in the stream * @length: the number of bytes to zero */ unsigned int xdr_stream_zero(struct xdr_stream *xdr, unsigned int offset, unsigned int length) { struct xdr_buf buf; if (xdr_buf_subsegment(xdr->buf, &buf, offset, length) < 0) return 0; if (buf.head[0].iov_len) xdr_buf_iov_zero(buf.head, 0, buf.head[0].iov_len); if (buf.page_len > 0) xdr_buf_pages_zero(&buf, 0, buf.page_len); if (buf.tail[0].iov_len) xdr_buf_iov_zero(buf.tail, 0, buf.tail[0].iov_len); return length; } EXPORT_SYMBOL_GPL(xdr_stream_zero); /** * xdr_buf_trim - lop at most "len" bytes off the end of "buf" * @buf: buf to be trimmed * @len: number of bytes to reduce "buf" by * * Trim an xdr_buf by the given number of bytes by fixing up the lengths. Note * that it's possible that we'll trim less than that amount if the xdr_buf is * too small, or if (for instance) it's all in the head and the parser has * already read too far into it. */ void xdr_buf_trim(struct xdr_buf *buf, unsigned int len) { size_t cur; unsigned int trim = len; if (buf->tail[0].iov_len) { cur = min_t(size_t, buf->tail[0].iov_len, trim); buf->tail[0].iov_len -= cur; trim -= cur; if (!trim) goto fix_len; } if (buf->page_len) { cur = min_t(unsigned int, buf->page_len, trim); buf->page_len -= cur; trim -= cur; if (!trim) goto fix_len; } if (buf->head[0].iov_len) { cur = min_t(size_t, buf->head[0].iov_len, trim); buf->head[0].iov_len -= cur; trim -= cur; } fix_len: buf->len -= (len - trim); } EXPORT_SYMBOL_GPL(xdr_buf_trim); static void __read_bytes_from_xdr_buf(const struct xdr_buf *subbuf, void *obj, unsigned int len) { unsigned int this_len; this_len = min_t(unsigned int, len, subbuf->head[0].iov_len); memcpy(obj, subbuf->head[0].iov_base, this_len); len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->page_len); _copy_from_pages(obj, subbuf->pages, subbuf->page_base, this_len); len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->tail[0].iov_len); memcpy(obj, subbuf->tail[0].iov_base, this_len); } /* obj is assumed to point to allocated memory of size at least len: */ int read_bytes_from_xdr_buf(const struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len) { struct xdr_buf subbuf; int status; status = xdr_buf_subsegment(buf, &subbuf, base, len); if (status != 0) return status; __read_bytes_from_xdr_buf(&subbuf, obj, len); return 0; } EXPORT_SYMBOL_GPL(read_bytes_from_xdr_buf); static void __write_bytes_to_xdr_buf(const struct xdr_buf *subbuf, void *obj, unsigned int len) { unsigned int this_len; this_len = min_t(unsigned int, len, subbuf->head[0].iov_len); memcpy(subbuf->head[0].iov_base, obj, this_len); len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->page_len); _copy_to_pages(subbuf->pages, subbuf->page_base, obj, this_len); len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->tail[0].iov_len); memcpy(subbuf->tail[0].iov_base, obj, this_len); } /* obj is assumed to point to allocated memory of size at least len: */ int write_bytes_to_xdr_buf(const struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len) { struct xdr_buf subbuf; int status; status = xdr_buf_subsegment(buf, &subbuf, base, len); if (status != 0) return status; __write_bytes_to_xdr_buf(&subbuf, obj, len); return 0; } EXPORT_SYMBOL_GPL(write_bytes_to_xdr_buf); int xdr_decode_word(const struct xdr_buf *buf, unsigned int base, u32 *obj) { __be32 raw; int status; status = read_bytes_from_xdr_buf(buf, base, &raw, sizeof(*obj)); if (status) return status; *obj = be32_to_cpu(raw); return 0; } EXPORT_SYMBOL_GPL(xdr_decode_word); int xdr_encode_word(const struct xdr_buf *buf, unsigned int base, u32 obj) { __be32 raw = cpu_to_be32(obj); return write_bytes_to_xdr_buf(buf, base, &raw, sizeof(obj)); } EXPORT_SYMBOL_GPL(xdr_encode_word); /* Returns 0 on success, or else a negative error code. */ static int xdr_xcode_array2(const struct xdr_buf *buf, unsigned int base, struct xdr_array2_desc *desc, int encode) { char *elem = NULL, *c; unsigned int copied = 0, todo, avail_here; struct page **ppages = NULL; int err; if (encode) { if (xdr_encode_word(buf, base, desc->array_len) != 0) return -EINVAL; } else { if (xdr_decode_word(buf, base, &desc->array_len) != 0 || desc->array_len > desc->array_maxlen || (unsigned long) base + 4 + desc->array_len * desc->elem_size > buf->len) return -EINVAL; } base += 4; if (!desc->xcode) return 0; todo = desc->array_len * desc->elem_size; /* process head */ if (todo && base < buf->head->iov_len) { c = buf->head->iov_base + base; avail_here = min_t(unsigned int, todo, buf->head->iov_len - base); todo -= avail_here; while (avail_here >= desc->elem_size) { err = desc->xcode(desc, c); if (err) goto out; c += desc->elem_size; avail_here -= desc->elem_size; } if (avail_here) { if (!elem) { elem = kmalloc(desc->elem_size, GFP_KERNEL); err = -ENOMEM; if (!elem) goto out; } if (encode) { err = desc->xcode(desc, elem); if (err) goto out; memcpy(c, elem, avail_here); } else memcpy(elem, c, avail_here); copied = avail_here; } base = buf->head->iov_len; /* align to start of pages */ } /* process pages array */ base -= buf->head->iov_len; if (todo && base < buf->page_len) { unsigned int avail_page; avail_here = min(todo, buf->page_len - base); todo -= avail_here; base += buf->page_base; ppages = buf->pages + (base >> PAGE_SHIFT); base &= ~PAGE_MASK; avail_page = min_t(unsigned int, PAGE_SIZE - base, avail_here); c = kmap(*ppages) + base; while (avail_here) { avail_here -= avail_page; if (copied || avail_page < desc->elem_size) { unsigned int l = min(avail_page, desc->elem_size - copied); if (!elem) { elem = kmalloc(desc->elem_size, GFP_KERNEL); err = -ENOMEM; if (!elem) goto out; } if (encode) { if (!copied) { err = desc->xcode(desc, elem); if (err) goto out; } memcpy(c, elem + copied, l); copied += l; if (copied == desc->elem_size) copied = 0; } else { memcpy(elem + copied, c, l); copied += l; if (copied == desc->elem_size) { err = desc->xcode(desc, elem); if (err) goto out; copied = 0; } } avail_page -= l; c += l; } while (avail_page >= desc->elem_size) { err = desc->xcode(desc, c); if (err) goto out; c += desc->elem_size; avail_page -= desc->elem_size; } if (avail_page) { unsigned int l = min(avail_page, desc->elem_size - copied); if (!elem) { elem = kmalloc(desc->elem_size, GFP_KERNEL); err = -ENOMEM; if (!elem) goto out; } if (encode) { if (!copied) { err = desc->xcode(desc, elem); if (err) goto out; } memcpy(c, elem + copied, l); copied += l; if (copied == desc->elem_size) copied = 0; } else { memcpy(elem + copied, c, l); copied += l; if (copied == desc->elem_size) { err = desc->xcode(desc, elem); if (err) goto out; copied = 0; } } } if (avail_here) { kunmap(*ppages); ppages++; c = kmap(*ppages); } avail_page = min(avail_here, (unsigned int) PAGE_SIZE); } base = buf->page_len; /* align to start of tail */ } /* process tail */ base -= buf->page_len; if (todo) { c = buf->tail->iov_base + base; if (copied) { unsigned int l = desc->elem_size - copied; if (encode) memcpy(c, elem + copied, l); else { memcpy(elem + copied, c, l); err = desc->xcode(desc, elem); if (err) goto out; } todo -= l; c += l; } while (todo) { err = desc->xcode(desc, c); if (err) goto out; c += desc->elem_size; todo -= desc->elem_size; } } err = 0; out: kfree(elem); if (ppages) kunmap(*ppages); return err; } int xdr_decode_array2(const struct xdr_buf *buf, unsigned int base, struct xdr_array2_desc *desc) { if (base >= buf->len) return -EINVAL; return xdr_xcode_array2(buf, base, desc, 0); } EXPORT_SYMBOL_GPL(xdr_decode_array2); int xdr_encode_array2(const struct xdr_buf *buf, unsigned int base, struct xdr_array2_desc *desc) { if ((unsigned long) base + 4 + desc->array_len * desc->elem_size > buf->head->iov_len + buf->page_len + buf->tail->iov_len) return -EINVAL; return xdr_xcode_array2(buf, base, desc, 1); } EXPORT_SYMBOL_GPL(xdr_encode_array2); int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data) { int i, ret = 0; unsigned int page_len, thislen, page_offset; struct scatterlist sg[1]; sg_init_table(sg, 1); if (offset >= buf->head[0].iov_len) { offset -= buf->head[0].iov_len; } else { thislen = buf->head[0].iov_len - offset; if (thislen > len) thislen = len; sg_set_buf(sg, buf->head[0].iov_base + offset, thislen); ret = actor(sg, data); if (ret) goto out; offset = 0; len -= thislen; } if (len == 0) goto out; if (offset >= buf->page_len) { offset -= buf->page_len; } else { page_len = buf->page_len - offset; if (page_len > len) page_len = len; len -= page_len; page_offset = (offset + buf->page_base) & (PAGE_SIZE - 1); i = (offset + buf->page_base) >> PAGE_SHIFT; thislen = PAGE_SIZE - page_offset; do { if (thislen > page_len) thislen = page_len; sg_set_page(sg, buf->pages[i], thislen, page_offset); ret = actor(sg, data); if (ret) goto out; page_len -= thislen; i++; page_offset = 0; thislen = PAGE_SIZE; } while (page_len != 0); offset = 0; } if (len == 0) goto out; if (offset < buf->tail[0].iov_len) { thislen = buf->tail[0].iov_len - offset; if (thislen > len) thislen = len; sg_set_buf(sg, buf->tail[0].iov_base + offset, thislen); ret = actor(sg, data); len -= thislen; } if (len != 0) ret = -EINVAL; out: return ret; } EXPORT_SYMBOL_GPL(xdr_process_buf); /** * xdr_stream_decode_string_dup - Decode and duplicate variable length string * @xdr: pointer to xdr_stream * @str: location to store pointer to string * @maxlen: maximum acceptable string length * @gfp_flags: GFP mask to use * * Return values: * On success, returns length of NUL-terminated string stored in *@ptr * %-EBADMSG on XDR buffer overflow * %-EMSGSIZE if the size of the string would exceed @maxlen * %-ENOMEM on memory allocation failure */ ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str, size_t maxlen, gfp_t gfp_flags) { void *p; ssize_t ret; ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen); if (ret > 0) { char *s = kmemdup_nul(p, ret, gfp_flags); if (s != NULL) { *str = s; return strlen(s); } ret = -ENOMEM; } *str = NULL; return ret; } EXPORT_SYMBOL_GPL(xdr_stream_decode_string_dup); /** * xdr_stream_decode_opaque_auth - Decode struct opaque_auth (RFC5531 S8.2) * @xdr: pointer to xdr_stream * @flavor: location to store decoded flavor * @body: location to store decode body * @body_len: location to store length of decoded body * * Return values: * On success, returns the number of buffer bytes consumed * %-EBADMSG on XDR buffer overflow * %-EMSGSIZE if the decoded size of the body field exceeds 400 octets */ ssize_t xdr_stream_decode_opaque_auth(struct xdr_stream *xdr, u32 *flavor, void **body, unsigned int *body_len) { ssize_t ret, len; len = xdr_stream_decode_u32(xdr, flavor); if (unlikely(len < 0)) return len; ret = xdr_stream_decode_opaque_inline(xdr, body, RPC_MAX_AUTH_SIZE); if (unlikely(ret < 0)) return ret; *body_len = ret; return len + ret; } EXPORT_SYMBOL_GPL(xdr_stream_decode_opaque_auth); /** * xdr_stream_encode_opaque_auth - Encode struct opaque_auth (RFC5531 S8.2) * @xdr: pointer to xdr_stream * @flavor: verifier flavor to encode * @body: content of body to encode * @body_len: length of body to encode * * Return values: * On success, returns length in bytes of XDR buffer consumed * %-EBADMSG on XDR buffer overflow * %-EMSGSIZE if the size of @body exceeds 400 octets */ ssize_t xdr_stream_encode_opaque_auth(struct xdr_stream *xdr, u32 flavor, void *body, unsigned int body_len) { ssize_t ret, len; if (unlikely(body_len > RPC_MAX_AUTH_SIZE)) return -EMSGSIZE; len = xdr_stream_encode_u32(xdr, flavor); if (unlikely(len < 0)) return len; ret = xdr_stream_encode_opaque(xdr, body, body_len); if (unlikely(ret < 0)) return ret; return len + ret; } EXPORT_SYMBOL_GPL(xdr_stream_encode_opaque_auth);
58 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_TLB_H #define _ASM_X86_TLB_H #define tlb_flush tlb_flush static inline void tlb_flush(struct mmu_gather *tlb); #include <asm-generic/tlb.h> #include <linux/kernel.h> #include <vdso/bits.h> #include <vdso/page.h> static inline void tlb_flush(struct mmu_gather *tlb) { unsigned long start = 0UL, end = TLB_FLUSH_ALL; unsigned int stride_shift = tlb_get_unmap_shift(tlb); if (!tlb->fullmm && !tlb->need_flush_all) { start = tlb->start; end = tlb->end; } flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables); } static inline void invlpg(unsigned long addr) { asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); } enum addr_stride { PTE_STRIDE = 0, PMD_STRIDE = 1 }; /* * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination * of the three. For example: * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address * - FLAG_PCID: invalidate all TLB entries matching the PCID * * The first is used to invalidate (kernel) mappings at a particular * address across all processes. * * The latter invalidates all TLB entries matching a PCID. */ #define INVLPGB_FLAG_VA BIT(0) #define INVLPGB_FLAG_PCID BIT(1) #define INVLPGB_FLAG_ASID BIT(2) #define INVLPGB_FLAG_INCLUDE_GLOBAL BIT(3) #define INVLPGB_FLAG_FINAL_ONLY BIT(4) #define INVLPGB_FLAG_INCLUDE_NESTED BIT(5) /* The implied mode when all bits are clear: */ #define INVLPGB_MODE_ALL_NONGLOBALS 0UL #ifdef CONFIG_BROADCAST_TLB_FLUSH /* * INVLPGB does broadcast TLB invalidation across all the CPUs in the system. * * The INVLPGB instruction is weakly ordered, and a batch of invalidations can * be done in a parallel fashion. * * The instruction takes the number of extra pages to invalidate, beyond the * first page, while __invlpgb gets the more human readable number of pages to * invalidate. * * The bits in rax[0:2] determine respectively which components of the address * (VA, PCID, ASID) get compared when flushing. If neither bits are set, *any* * address in the specified range matches. * * Since it is desired to only flush TLB entries for the ASID that is executing * the instruction (a host/hypervisor or a guest), the ASID valid bit should * always be set. On a host/hypervisor, the hardware will use the ASID value * specified in EDX[15:0] (which should be 0). On a guest, the hardware will * use the actual ASID value of the guest. * * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from * this CPU have completed. */ static inline void __invlpgb(unsigned long asid, unsigned long pcid, unsigned long addr, u16 nr_pages, enum addr_stride stride, u8 flags) { u64 rax = addr | flags | INVLPGB_FLAG_ASID; u32 ecx = (stride << 31) | (nr_pages - 1); u32 edx = (pcid << 16) | asid; /* The low bits in rax are for flags. Verify addr is clean. */ VM_WARN_ON_ONCE(addr & ~PAGE_MASK); /* INVLPGB; supported in binutils >= 2.36. */ asm volatile(".byte 0x0f, 0x01, 0xfe" :: "a" (rax), "c" (ecx), "d" (edx)); } static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags) { __invlpgb(asid, pcid, 0, 1, 0, flags); } static inline void __tlbsync(void) { /* * TLBSYNC waits for INVLPGB instructions originating on the same CPU * to have completed. Print a warning if the task has been migrated, * and might not be waiting on all the INVLPGBs issued during this TLB * invalidation sequence. */ cant_migrate(); /* TLBSYNC: supported in binutils >= 0.36. */ asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory"); } #else /* Some compilers (I'm looking at you clang!) simply can't do DCE */ static inline void __invlpgb(unsigned long asid, unsigned long pcid, unsigned long addr, u16 nr_pages, enum addr_stride s, u8 flags) { } static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags) { } static inline void __tlbsync(void) { } #endif static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid, unsigned long addr, u16 nr, bool stride) { enum addr_stride str = stride ? PMD_STRIDE : PTE_STRIDE; u8 flags = INVLPGB_FLAG_PCID | INVLPGB_FLAG_VA; __invlpgb(0, pcid, addr, nr, str, flags); } /* Flush all mappings for a given PCID, not including globals. */ static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid) { __invlpgb_all(0, pcid, INVLPGB_FLAG_PCID); } /* Flush all mappings, including globals, for all PCIDs. */ static inline void invlpgb_flush_all(void) { /* * TLBSYNC at the end needs to make sure all flushes done on the * current CPU have been executed system-wide. Therefore, make * sure nothing gets migrated in-between but disable preemption * as it is cheaper. */ guard(preempt)(); __invlpgb_all(0, 0, INVLPGB_FLAG_INCLUDE_GLOBAL); __tlbsync(); } /* Flush addr, including globals, for all PCIDs. */ static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr) { __invlpgb(0, 0, addr, nr, PTE_STRIDE, INVLPGB_FLAG_INCLUDE_GLOBAL); } /* Flush all mappings for all PCIDs except globals. */ static inline void invlpgb_flush_all_nonglobals(void) { guard(preempt)(); __invlpgb_all(0, 0, INVLPGB_MODE_ALL_NONGLOBALS); __tlbsync(); } #endif /* _ASM_X86_TLB_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2002 David S. Miller (davem@redhat.com) * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au> * * Portions derived from Cryptoapi, by Alexander Kjeldaas <astor@fast.no> * and Nettle, by Niels Möller. */ #ifndef _CRYPTO_INTERNAL_CIPHER_H #define _CRYPTO_INTERNAL_CIPHER_H #include <crypto/algapi.h> struct crypto_cipher { struct crypto_tfm base; }; /** * DOC: Single Block Cipher API * * The single block cipher API is used with the ciphers of type * CRYPTO_ALG_TYPE_CIPHER (listed as type "cipher" in /proc/crypto). * * Using the single block cipher API calls, operations with the basic cipher * primitive can be implemented. These cipher primitives exclude any block * chaining operations including IV handling. * * The purpose of this single block cipher API is to support the implementation * of templates or other concepts that only need to perform the cipher operation * on one block at a time. Templates invoke the underlying cipher primitive * block-wise and process either the input or the output data of these cipher * operations. */ static inline struct crypto_cipher *__crypto_cipher_cast(struct crypto_tfm *tfm) { return (struct crypto_cipher *)tfm; } /** * crypto_alloc_cipher() - allocate single block cipher handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * single block cipher * @type: specifies the type of the cipher * @mask: specifies the mask for the cipher * * Allocate a cipher handle for a single block cipher. The returned struct * crypto_cipher is the cipher handle that is required for any subsequent API * invocation for that single block cipher. * * Return: allocated cipher handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ static inline struct crypto_cipher *crypto_alloc_cipher(const char *alg_name, u32 type, u32 mask) { type &= ~CRYPTO_ALG_TYPE_MASK; type |= CRYPTO_ALG_TYPE_CIPHER; mask |= CRYPTO_ALG_TYPE_MASK; return __crypto_cipher_cast(crypto_alloc_base(alg_name, type, mask)); } static inline struct crypto_tfm *crypto_cipher_tfm(struct crypto_cipher *tfm) { return &tfm->base; } /** * crypto_free_cipher() - zeroize and free the single block cipher handle * @tfm: cipher handle to be freed */ static inline void crypto_free_cipher(struct crypto_cipher *tfm) { crypto_free_tfm(crypto_cipher_tfm(tfm)); } /** * crypto_has_cipher() - Search for the availability of a single block cipher * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * single block cipher * @type: specifies the type of the cipher * @mask: specifies the mask for the cipher * * Return: true when the single block cipher is known to the kernel crypto API; * false otherwise */ static inline int crypto_has_cipher(const char *alg_name, u32 type, u32 mask) { type &= ~CRYPTO_ALG_TYPE_MASK; type |= CRYPTO_ALG_TYPE_CIPHER; mask |= CRYPTO_ALG_TYPE_MASK; return crypto_has_alg(alg_name, type, mask); } /** * crypto_cipher_blocksize() - obtain block size for cipher * @tfm: cipher handle * * The block size for the single block cipher referenced with the cipher handle * tfm is returned. The caller may use that information to allocate appropriate * memory for the data returned by the encryption or decryption operation * * Return: block size of cipher */ static inline unsigned int crypto_cipher_blocksize(struct crypto_cipher *tfm) { return crypto_tfm_alg_blocksize(crypto_cipher_tfm(tfm)); } static inline unsigned int crypto_cipher_alignmask(struct crypto_cipher *tfm) { return crypto_tfm_alg_alignmask(crypto_cipher_tfm(tfm)); } static inline u32 crypto_cipher_get_flags(struct crypto_cipher *tfm) { return crypto_tfm_get_flags(crypto_cipher_tfm(tfm)); } static inline void crypto_cipher_set_flags(struct crypto_cipher *tfm, u32 flags) { crypto_tfm_set_flags(crypto_cipher_tfm(tfm), flags); } static inline void crypto_cipher_clear_flags(struct crypto_cipher *tfm, u32 flags) { crypto_tfm_clear_flags(crypto_cipher_tfm(tfm), flags); } /** * crypto_cipher_setkey() - set key for cipher * @tfm: cipher handle * @key: buffer holding the key * @keylen: length of the key in bytes * * The caller provided key is set for the single block cipher referenced by the * cipher handle. * * Note, the key length determines the cipher type. Many block ciphers implement * different cipher modes depending on the key size, such as AES-128 vs AES-192 * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128 * is performed. * * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_cipher_setkey(struct crypto_cipher *tfm, const u8 *key, unsigned int keylen); /** * crypto_cipher_encrypt_one() - encrypt one block of plaintext * @tfm: cipher handle * @dst: points to the buffer that will be filled with the ciphertext * @src: buffer holding the plaintext to be encrypted * * Invoke the encryption operation of one block. The caller must ensure that * the plaintext and ciphertext buffers are at least one block in size. */ void crypto_cipher_encrypt_one(struct crypto_cipher *tfm, u8 *dst, const u8 *src); /** * crypto_cipher_decrypt_one() - decrypt one block of ciphertext * @tfm: cipher handle * @dst: points to the buffer that will be filled with the plaintext * @src: buffer holding the ciphertext to be decrypted * * Invoke the decryption operation of one block. The caller must ensure that * the plaintext and ciphertext buffers are at least one block in size. */ void crypto_cipher_decrypt_one(struct crypto_cipher *tfm, u8 *dst, const u8 *src); struct crypto_cipher *crypto_clone_cipher(struct crypto_cipher *cipher); struct crypto_cipher_spawn { struct crypto_spawn base; }; static inline int crypto_grab_cipher(struct crypto_cipher_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask) { type &= ~CRYPTO_ALG_TYPE_MASK; type |= CRYPTO_ALG_TYPE_CIPHER; mask |= CRYPTO_ALG_TYPE_MASK; return crypto_grab_spawn(&spawn->base, inst, name, type, mask); } static inline void crypto_drop_cipher(struct crypto_cipher_spawn *spawn) { crypto_drop_spawn(&spawn->base); } static inline struct crypto_alg *crypto_spawn_cipher_alg( struct crypto_cipher_spawn *spawn) { return spawn->base.alg; } static inline struct crypto_cipher *crypto_spawn_cipher( struct crypto_cipher_spawn *spawn) { u32 type = CRYPTO_ALG_TYPE_CIPHER; u32 mask = CRYPTO_ALG_TYPE_MASK; return __crypto_cipher_cast(crypto_spawn_tfm(&spawn->base, type, mask)); } static inline struct cipher_alg *crypto_cipher_alg(struct crypto_cipher *tfm) { return &crypto_cipher_tfm(tfm)->__crt_alg->cra_cipher; } #endif
16 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 // SPDX-License-Identifier: GPL-2.0+ /* * IMA support for appraising module-style appended signatures. * * Copyright (C) 2019 IBM Corporation * * Author: * Thiago Jung Bauermann <bauerman@linux.ibm.com> */ #include <linux/types.h> #include <linux/module_signature.h> #include <keys/asymmetric-type.h> #include <crypto/pkcs7.h> #include "ima.h" struct modsig { struct pkcs7_message *pkcs7_msg; enum hash_algo hash_algo; /* This digest will go in the 'd-modsig' field of the IMA template. */ const u8 *digest; u32 digest_size; /* * This is what will go to the measurement list if the template requires * storing the signature. */ int raw_pkcs7_len; u8 raw_pkcs7[] __counted_by(raw_pkcs7_len); }; /* * ima_read_modsig - Read modsig from buf. * * Return: 0 on success, error code otherwise. */ int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len, struct modsig **modsig) { const size_t marker_len = strlen(MODULE_SIG_STRING); const struct module_signature *sig; struct modsig *hdr; size_t sig_len; const void *p; int rc; if (buf_len <= marker_len + sizeof(*sig)) return -ENOENT; p = buf + buf_len - marker_len; if (memcmp(p, MODULE_SIG_STRING, marker_len)) return -ENOENT; buf_len -= marker_len; sig = (const struct module_signature *)(p - sizeof(*sig)); rc = mod_check_sig(sig, buf_len, func_tokens[func]); if (rc) return rc; sig_len = be32_to_cpu(sig->sig_len); buf_len -= sig_len + sizeof(*sig); /* Allocate sig_len additional bytes to hold the raw PKCS#7 data. */ hdr = kzalloc(struct_size(hdr, raw_pkcs7, sig_len), GFP_KERNEL); if (!hdr) return -ENOMEM; hdr->raw_pkcs7_len = sig_len; hdr->pkcs7_msg = pkcs7_parse_message(buf + buf_len, sig_len); if (IS_ERR(hdr->pkcs7_msg)) { rc = PTR_ERR(hdr->pkcs7_msg); kfree(hdr); return rc; } memcpy(hdr->raw_pkcs7, buf + buf_len, sig_len); /* We don't know the hash algorithm yet. */ hdr->hash_algo = HASH_ALGO__LAST; *modsig = hdr; return 0; } /** * ima_collect_modsig - Calculate the file hash without the appended signature. * @modsig: parsed module signature * @buf: data to verify the signature on * @size: data size * * Since the modsig is part of the file contents, the hash used in its signature * isn't the same one ordinarily calculated by IMA. Therefore PKCS7 code * calculates a separate one for signature verification. */ void ima_collect_modsig(struct modsig *modsig, const void *buf, loff_t size) { int rc; /* * Provide the file contents (minus the appended sig) so that the PKCS7 * code can calculate the file hash. */ size -= modsig->raw_pkcs7_len + strlen(MODULE_SIG_STRING) + sizeof(struct module_signature); rc = pkcs7_supply_detached_data(modsig->pkcs7_msg, buf, size); if (rc) return; /* Ask the PKCS7 code to calculate the file hash. */ rc = pkcs7_get_digest(modsig->pkcs7_msg, &modsig->digest, &modsig->digest_size, &modsig->hash_algo); } int ima_modsig_verify(struct key *keyring, const struct modsig *modsig) { return verify_pkcs7_message_sig(NULL, 0, modsig->pkcs7_msg, keyring, VERIFYING_MODULE_SIGNATURE, NULL, NULL); } int ima_get_modsig_digest(const struct modsig *modsig, enum hash_algo *algo, const u8 **digest, u32 *digest_size) { *algo = modsig->hash_algo; *digest = modsig->digest; *digest_size = modsig->digest_size; return 0; } int ima_get_raw_modsig(const struct modsig *modsig, const void **data, u32 *data_len) { *data = &modsig->raw_pkcs7; *data_len = modsig->raw_pkcs7_len; return 0; } void ima_free_modsig(struct modsig *modsig) { if (!modsig) return; pkcs7_free_message(modsig->pkcs7_msg); kfree(modsig); }
30 3 10 17 1 2 2 1 2 4 1 3 12 23 3 5 15 15 5 5 5 5 5 5 5 10 5 1 4 2 9 3 6 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 // SPDX-License-Identifier: GPL-2.0-or-later /* * (C) 2012 Pablo Neira Ayuso <pablo@netfilter.org> * * This software has been sponsored by Vyatta Inc. <http://www.vyatta.com> */ #include <linux/init.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/list.h> #include <linux/errno.h> #include <linux/capability.h> #include <net/netlink.h> #include <net/sock.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> #include <linux/netfilter/nfnetlink_cthelper.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_DESCRIPTION("nfnl_cthelper: User-space connection tracking helpers"); struct nfnl_cthelper { struct list_head list; struct nf_conntrack_helper helper; }; static LIST_HEAD(nfnl_cthelper_list); static int nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { const struct nf_conn_help *help; struct nf_conntrack_helper *helper; help = nfct_help(ct); if (help == NULL) return NF_DROP; /* rcu_read_lock()ed by nf_hook_thresh */ helper = rcu_dereference(help->helper); if (helper == NULL) return NF_DROP; /* This is a user-space helper not yet configured, skip. */ if ((helper->flags & (NF_CT_HELPER_F_USERSPACE | NF_CT_HELPER_F_CONFIGURED)) == NF_CT_HELPER_F_USERSPACE) return NF_ACCEPT; /* If the user-space helper is not available, don't block traffic. */ return NF_QUEUE_NR(helper->queue_num) | NF_VERDICT_FLAG_QUEUE_BYPASS; } static const struct nla_policy nfnl_cthelper_tuple_pol[NFCTH_TUPLE_MAX+1] = { [NFCTH_TUPLE_L3PROTONUM] = { .type = NLA_U16, }, [NFCTH_TUPLE_L4PROTONUM] = { .type = NLA_U8, }, }; static int nfnl_cthelper_parse_tuple(struct nf_conntrack_tuple *tuple, const struct nlattr *attr) { int err; struct nlattr *tb[NFCTH_TUPLE_MAX+1]; err = nla_parse_nested_deprecated(tb, NFCTH_TUPLE_MAX, attr, nfnl_cthelper_tuple_pol, NULL); if (err < 0) return err; if (!tb[NFCTH_TUPLE_L3PROTONUM] || !tb[NFCTH_TUPLE_L4PROTONUM]) return -EINVAL; /* Not all fields are initialized so first zero the tuple */ memset(tuple, 0, sizeof(struct nf_conntrack_tuple)); tuple->src.l3num = ntohs(nla_get_be16(tb[NFCTH_TUPLE_L3PROTONUM])); tuple->dst.protonum = nla_get_u8(tb[NFCTH_TUPLE_L4PROTONUM]); return 0; } static int nfnl_cthelper_from_nlattr(struct nlattr *attr, struct nf_conn *ct) { struct nf_conn_help *help = nfct_help(ct); const struct nf_conntrack_helper *helper; if (attr == NULL) return -EINVAL; helper = rcu_dereference(help->helper); if (!helper || helper->data_len == 0) return -EINVAL; nla_memcpy(help->data, attr, sizeof(help->data)); return 0; } static int nfnl_cthelper_to_nlattr(struct sk_buff *skb, const struct nf_conn *ct) { const struct nf_conn_help *help = nfct_help(ct); const struct nf_conntrack_helper *helper; helper = rcu_dereference(help->helper); if (helper && helper->data_len && nla_put(skb, CTA_HELP_INFO, helper->data_len, &help->data)) goto nla_put_failure; return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy nfnl_cthelper_expect_pol[NFCTH_POLICY_MAX+1] = { [NFCTH_POLICY_NAME] = { .type = NLA_NUL_STRING, .len = NF_CT_HELPER_NAME_LEN-1 }, [NFCTH_POLICY_EXPECT_MAX] = { .type = NLA_U32, }, [NFCTH_POLICY_EXPECT_TIMEOUT] = { .type = NLA_U32, }, }; static int nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy, const struct nlattr *attr) { int err; struct nlattr *tb[NFCTH_POLICY_MAX+1]; err = nla_parse_nested_deprecated(tb, NFCTH_POLICY_MAX, attr, nfnl_cthelper_expect_pol, NULL); if (err < 0) return err; if (!tb[NFCTH_POLICY_NAME] || !tb[NFCTH_POLICY_EXPECT_MAX] || !tb[NFCTH_POLICY_EXPECT_TIMEOUT]) return -EINVAL; nla_strscpy(expect_policy->name, tb[NFCTH_POLICY_NAME], NF_CT_HELPER_NAME_LEN); expect_policy->max_expected = ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX])); if (expect_policy->max_expected > NF_CT_EXPECT_MAX_CNT) return -EINVAL; expect_policy->timeout = ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_TIMEOUT])); return 0; } static const struct nla_policy nfnl_cthelper_expect_policy_set[NFCTH_POLICY_SET_MAX+1] = { [NFCTH_POLICY_SET_NUM] = { .type = NLA_U32, }, }; static int nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper, const struct nlattr *attr) { int i, ret; struct nf_conntrack_expect_policy *expect_policy; struct nlattr *tb[NFCTH_POLICY_SET_MAX+1]; unsigned int class_max; ret = nla_parse_nested_deprecated(tb, NFCTH_POLICY_SET_MAX, attr, nfnl_cthelper_expect_policy_set, NULL); if (ret < 0) return ret; if (!tb[NFCTH_POLICY_SET_NUM]) return -EINVAL; class_max = ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM])); if (class_max == 0) return -EINVAL; if (class_max > NF_CT_MAX_EXPECT_CLASSES) return -EOVERFLOW; expect_policy = kcalloc(class_max, sizeof(struct nf_conntrack_expect_policy), GFP_KERNEL); if (expect_policy == NULL) return -ENOMEM; for (i = 0; i < class_max; i++) { if (!tb[NFCTH_POLICY_SET+i]) goto err; ret = nfnl_cthelper_expect_policy(&expect_policy[i], tb[NFCTH_POLICY_SET+i]); if (ret < 0) goto err; } helper->expect_class_max = class_max - 1; helper->expect_policy = expect_policy; return 0; err: kfree(expect_policy); return -EINVAL; } static int nfnl_cthelper_create(const struct nlattr * const tb[], struct nf_conntrack_tuple *tuple) { struct nf_conntrack_helper *helper; struct nfnl_cthelper *nfcth; unsigned int size; int ret; if (!tb[NFCTH_TUPLE] || !tb[NFCTH_POLICY] || !tb[NFCTH_PRIV_DATA_LEN]) return -EINVAL; nfcth = kzalloc(sizeof(*nfcth), GFP_KERNEL); if (nfcth == NULL) return -ENOMEM; helper = &nfcth->helper; ret = nfnl_cthelper_parse_expect_policy(helper, tb[NFCTH_POLICY]); if (ret < 0) goto err1; nla_strscpy(helper->name, tb[NFCTH_NAME], NF_CT_HELPER_NAME_LEN); size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN])); if (size > sizeof_field(struct nf_conn_help, data)) { ret = -ENOMEM; goto err2; } helper->data_len = size; helper->flags |= NF_CT_HELPER_F_USERSPACE; memcpy(&helper->tuple, tuple, sizeof(struct nf_conntrack_tuple)); helper->me = THIS_MODULE; helper->help = nfnl_userspace_cthelper; helper->from_nlattr = nfnl_cthelper_from_nlattr; helper->to_nlattr = nfnl_cthelper_to_nlattr; /* Default to queue number zero, this can be updated at any time. */ if (tb[NFCTH_QUEUE_NUM]) helper->queue_num = ntohl(nla_get_be32(tb[NFCTH_QUEUE_NUM])); if (tb[NFCTH_STATUS]) { int status = ntohl(nla_get_be32(tb[NFCTH_STATUS])); switch(status) { case NFCT_HELPER_STATUS_ENABLED: helper->flags |= NF_CT_HELPER_F_CONFIGURED; break; case NFCT_HELPER_STATUS_DISABLED: helper->flags &= ~NF_CT_HELPER_F_CONFIGURED; break; } } ret = nf_conntrack_helper_register(helper); if (ret < 0) goto err2; list_add_tail(&nfcth->list, &nfnl_cthelper_list); return 0; err2: kfree(helper->expect_policy); err1: kfree(nfcth); return ret; } static int nfnl_cthelper_update_policy_one(const struct nf_conntrack_expect_policy *policy, struct nf_conntrack_expect_policy *new_policy, const struct nlattr *attr) { struct nlattr *tb[NFCTH_POLICY_MAX + 1]; int err; err = nla_parse_nested_deprecated(tb, NFCTH_POLICY_MAX, attr, nfnl_cthelper_expect_pol, NULL); if (err < 0) return err; if (!tb[NFCTH_POLICY_NAME] || !tb[NFCTH_POLICY_EXPECT_MAX] || !tb[NFCTH_POLICY_EXPECT_TIMEOUT]) return -EINVAL; if (nla_strcmp(tb[NFCTH_POLICY_NAME], policy->name)) return -EBUSY; new_policy->max_expected = ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX])); if (new_policy->max_expected > NF_CT_EXPECT_MAX_CNT) return -EINVAL; new_policy->timeout = ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_TIMEOUT])); return 0; } static int nfnl_cthelper_update_policy_all(struct nlattr *tb[], struct nf_conntrack_helper *helper) { struct nf_conntrack_expect_policy *new_policy; struct nf_conntrack_expect_policy *policy; int i, ret = 0; new_policy = kmalloc_array(helper->expect_class_max + 1, sizeof(*new_policy), GFP_KERNEL); if (!new_policy) return -ENOMEM; /* Check first that all policy attributes are well-formed, so we don't * leave things in inconsistent state on errors. */ for (i = 0; i < helper->expect_class_max + 1; i++) { if (!tb[NFCTH_POLICY_SET + i]) { ret = -EINVAL; goto err; } ret = nfnl_cthelper_update_policy_one(&helper->expect_policy[i], &new_policy[i], tb[NFCTH_POLICY_SET + i]); if (ret < 0) goto err; } /* Now we can safely update them. */ for (i = 0; i < helper->expect_class_max + 1; i++) { policy = (struct nf_conntrack_expect_policy *) &helper->expect_policy[i]; policy->max_expected = new_policy->max_expected; policy->timeout = new_policy->timeout; } err: kfree(new_policy); return ret; } static int nfnl_cthelper_update_policy(struct nf_conntrack_helper *helper, const struct nlattr *attr) { struct nlattr *tb[NFCTH_POLICY_SET_MAX + 1]; unsigned int class_max; int err; err = nla_parse_nested_deprecated(tb, NFCTH_POLICY_SET_MAX, attr, nfnl_cthelper_expect_policy_set, NULL); if (err < 0) return err; if (!tb[NFCTH_POLICY_SET_NUM]) return -EINVAL; class_max = ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM])); if (helper->expect_class_max + 1 != class_max) return -EBUSY; return nfnl_cthelper_update_policy_all(tb, helper); } static int nfnl_cthelper_update(const struct nlattr * const tb[], struct nf_conntrack_helper *helper) { u32 size; int ret; if (tb[NFCTH_PRIV_DATA_LEN]) { size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN])); if (size != helper->data_len) return -EBUSY; } if (tb[NFCTH_POLICY]) { ret = nfnl_cthelper_update_policy(helper, tb[NFCTH_POLICY]); if (ret < 0) return ret; } if (tb[NFCTH_QUEUE_NUM]) helper->queue_num = ntohl(nla_get_be32(tb[NFCTH_QUEUE_NUM])); if (tb[NFCTH_STATUS]) { int status = ntohl(nla_get_be32(tb[NFCTH_STATUS])); switch(status) { case NFCT_HELPER_STATUS_ENABLED: helper->flags |= NF_CT_HELPER_F_CONFIGURED; break; case NFCT_HELPER_STATUS_DISABLED: helper->flags &= ~NF_CT_HELPER_F_CONFIGURED; break; } } return 0; } static int nfnl_cthelper_new(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const tb[]) { const char *helper_name; struct nf_conntrack_helper *cur, *helper = NULL; struct nf_conntrack_tuple tuple; struct nfnl_cthelper *nlcth; int ret = 0; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!tb[NFCTH_NAME] || !tb[NFCTH_TUPLE]) return -EINVAL; helper_name = nla_data(tb[NFCTH_NAME]); ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]); if (ret < 0) return ret; list_for_each_entry(nlcth, &nfnl_cthelper_list, list) { cur = &nlcth->helper; if (strncmp(cur->name, helper_name, NF_CT_HELPER_NAME_LEN)) continue; if ((tuple.src.l3num != cur->tuple.src.l3num || tuple.dst.protonum != cur->tuple.dst.protonum)) continue; if (info->nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; helper = cur; break; } if (helper == NULL) ret = nfnl_cthelper_create(tb, &tuple); else ret = nfnl_cthelper_update(tb, helper); return ret; } static int nfnl_cthelper_dump_tuple(struct sk_buff *skb, struct nf_conntrack_helper *helper) { struct nlattr *nest_parms; nest_parms = nla_nest_start(skb, NFCTH_TUPLE); if (nest_parms == NULL) goto nla_put_failure; if (nla_put_be16(skb, NFCTH_TUPLE_L3PROTONUM, htons(helper->tuple.src.l3num))) goto nla_put_failure; if (nla_put_u8(skb, NFCTH_TUPLE_L4PROTONUM, helper->tuple.dst.protonum)) goto nla_put_failure; nla_nest_end(skb, nest_parms); return 0; nla_put_failure: return -1; } static int nfnl_cthelper_dump_policy(struct sk_buff *skb, struct nf_conntrack_helper *helper) { int i; struct nlattr *nest_parms1, *nest_parms2; nest_parms1 = nla_nest_start(skb, NFCTH_POLICY); if (nest_parms1 == NULL) goto nla_put_failure; if (nla_put_be32(skb, NFCTH_POLICY_SET_NUM, htonl(helper->expect_class_max + 1))) goto nla_put_failure; for (i = 0; i < helper->expect_class_max + 1; i++) { nest_parms2 = nla_nest_start(skb, (NFCTH_POLICY_SET + i)); if (nest_parms2 == NULL) goto nla_put_failure; if (nla_put_string(skb, NFCTH_POLICY_NAME, helper->expect_policy[i].name)) goto nla_put_failure; if (nla_put_be32(skb, NFCTH_POLICY_EXPECT_MAX, htonl(helper->expect_policy[i].max_expected))) goto nla_put_failure; if (nla_put_be32(skb, NFCTH_POLICY_EXPECT_TIMEOUT, htonl(helper->expect_policy[i].timeout))) goto nla_put_failure; nla_nest_end(skb, nest_parms2); } nla_nest_end(skb, nest_parms1); return 0; nla_put_failure: return -1; } static int nfnl_cthelper_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, struct nf_conntrack_helper *helper) { struct nlmsghdr *nlh; unsigned int flags = portid ? NLM_F_MULTI : 0; int status; event = nfnl_msg_type(NFNL_SUBSYS_CTHELPER, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, NFNETLINK_V0, 0); if (!nlh) goto nlmsg_failure; if (nla_put_string(skb, NFCTH_NAME, helper->name)) goto nla_put_failure; if (nla_put_be32(skb, NFCTH_QUEUE_NUM, htonl(helper->queue_num))) goto nla_put_failure; if (nfnl_cthelper_dump_tuple(skb, helper) < 0) goto nla_put_failure; if (nfnl_cthelper_dump_policy(skb, helper) < 0) goto nla_put_failure; if (nla_put_be32(skb, NFCTH_PRIV_DATA_LEN, htonl(helper->data_len))) goto nla_put_failure; if (helper->flags & NF_CT_HELPER_F_CONFIGURED) status = NFCT_HELPER_STATUS_ENABLED; else status = NFCT_HELPER_STATUS_DISABLED; if (nla_put_be32(skb, NFCTH_STATUS, htonl(status))) goto nla_put_failure; nlmsg_end(skb, nlh); return skb->len; nlmsg_failure: nla_put_failure: nlmsg_cancel(skb, nlh); return -1; } static int nfnl_cthelper_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { struct nf_conntrack_helper *cur, *last; rcu_read_lock(); last = (struct nf_conntrack_helper *)cb->args[1]; for (; cb->args[0] < nf_ct_helper_hsize; cb->args[0]++) { restart: hlist_for_each_entry_rcu(cur, &nf_ct_helper_hash[cb->args[0]], hnode) { /* skip non-userspace conntrack helpers. */ if (!(cur->flags & NF_CT_HELPER_F_USERSPACE)) continue; if (cb->args[1]) { if (cur != last) continue; cb->args[1] = 0; } if (nfnl_cthelper_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), NFNL_MSG_CTHELPER_NEW, cur) < 0) { cb->args[1] = (unsigned long)cur; goto out; } } } if (cb->args[1]) { cb->args[1] = 0; goto restart; } out: rcu_read_unlock(); return skb->len; } static int nfnl_cthelper_get(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const tb[]) { int ret = -ENOENT; struct nf_conntrack_helper *cur; struct sk_buff *skb2; char *helper_name = NULL; struct nf_conntrack_tuple tuple; struct nfnl_cthelper *nlcth; bool tuple_set = false; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nfnl_cthelper_dump_table, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } if (tb[NFCTH_NAME]) helper_name = nla_data(tb[NFCTH_NAME]); if (tb[NFCTH_TUPLE]) { ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]); if (ret < 0) return ret; tuple_set = true; } list_for_each_entry(nlcth, &nfnl_cthelper_list, list) { cur = &nlcth->helper; if (helper_name && strncmp(cur->name, helper_name, NF_CT_HELPER_NAME_LEN)) continue; if (tuple_set && (tuple.src.l3num != cur->tuple.src.l3num || tuple.dst.protonum != cur->tuple.dst.protonum)) continue; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb2 == NULL) { ret = -ENOMEM; break; } ret = nfnl_cthelper_fill_info(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFNL_MSG_TYPE(info->nlh->nlmsg_type), NFNL_MSG_CTHELPER_NEW, cur); if (ret <= 0) { kfree_skb(skb2); break; } ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); break; } return ret; } static int nfnl_cthelper_del(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const tb[]) { char *helper_name = NULL; struct nf_conntrack_helper *cur; struct nf_conntrack_tuple tuple; bool tuple_set = false, found = false; struct nfnl_cthelper *nlcth, *n; int j = 0, ret; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (tb[NFCTH_NAME]) helper_name = nla_data(tb[NFCTH_NAME]); if (tb[NFCTH_TUPLE]) { ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]); if (ret < 0) return ret; tuple_set = true; } ret = -ENOENT; list_for_each_entry_safe(nlcth, n, &nfnl_cthelper_list, list) { cur = &nlcth->helper; j++; if (helper_name && strncmp(cur->name, helper_name, NF_CT_HELPER_NAME_LEN)) continue; if (tuple_set && (tuple.src.l3num != cur->tuple.src.l3num || tuple.dst.protonum != cur->tuple.dst.protonum)) continue; if (refcount_dec_if_one(&cur->refcnt)) { found = true; nf_conntrack_helper_unregister(cur); kfree(cur->expect_policy); list_del(&nlcth->list); kfree(nlcth); } else { ret = -EBUSY; } } /* Make sure we return success if we flush and there is no helpers */ return (found || j == 0) ? 0 : ret; } static const struct nla_policy nfnl_cthelper_policy[NFCTH_MAX+1] = { [NFCTH_NAME] = { .type = NLA_NUL_STRING, .len = NF_CT_HELPER_NAME_LEN-1 }, [NFCTH_QUEUE_NUM] = { .type = NLA_U32, }, [NFCTH_PRIV_DATA_LEN] = { .type = NLA_U32, }, [NFCTH_STATUS] = { .type = NLA_U32, }, }; static const struct nfnl_callback nfnl_cthelper_cb[NFNL_MSG_CTHELPER_MAX] = { [NFNL_MSG_CTHELPER_NEW] = { .call = nfnl_cthelper_new, .type = NFNL_CB_MUTEX, .attr_count = NFCTH_MAX, .policy = nfnl_cthelper_policy }, [NFNL_MSG_CTHELPER_GET] = { .call = nfnl_cthelper_get, .type = NFNL_CB_MUTEX, .attr_count = NFCTH_MAX, .policy = nfnl_cthelper_policy }, [NFNL_MSG_CTHELPER_DEL] = { .call = nfnl_cthelper_del, .type = NFNL_CB_MUTEX, .attr_count = NFCTH_MAX, .policy = nfnl_cthelper_policy }, }; static const struct nfnetlink_subsystem nfnl_cthelper_subsys = { .name = "cthelper", .subsys_id = NFNL_SUBSYS_CTHELPER, .cb_count = NFNL_MSG_CTHELPER_MAX, .cb = nfnl_cthelper_cb, }; MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTHELPER); static int __init nfnl_cthelper_init(void) { int ret; ret = nfnetlink_subsys_register(&nfnl_cthelper_subsys); if (ret < 0) { pr_err("nfnl_cthelper: cannot register with nfnetlink.\n"); goto err_out; } return 0; err_out: return ret; } static void __exit nfnl_cthelper_exit(void) { struct nf_conntrack_helper *cur; struct nfnl_cthelper *nlcth, *n; nfnetlink_subsys_unregister(&nfnl_cthelper_subsys); list_for_each_entry_safe(nlcth, n, &nfnl_cthelper_list, list) { cur = &nlcth->helper; nf_conntrack_helper_unregister(cur); kfree(cur->expect_policy); kfree(nlcth); } } module_init(nfnl_cthelper_init); module_exit(nfnl_cthelper_exit);
442 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 // SPDX-License-Identifier: GPL-2.0 /* * Creating audit events from TTY input. * * Copyright (C) 2007 Red Hat, Inc. All rights reserved. * * Authors: Miloslav Trmac <mitr@redhat.com> */ #include <linux/audit.h> #include <linux/slab.h> #include <linux/tty.h> #include "tty.h" #define TTY_AUDIT_BUF_SIZE 4096 struct tty_audit_buf { struct mutex mutex; /* Protects all data below */ dev_t dev; /* The TTY which the data is from */ bool icanon; size_t valid; u8 *data; /* Allocated size TTY_AUDIT_BUF_SIZE */ }; static struct tty_audit_buf *tty_audit_buf_ref(void) { struct tty_audit_buf *buf; buf = current->signal->tty_audit_buf; WARN_ON(buf == ERR_PTR(-ESRCH)); return buf; } static struct tty_audit_buf *tty_audit_buf_alloc(void) { struct tty_audit_buf *buf; buf = kzalloc(sizeof(*buf), GFP_KERNEL); if (!buf) goto err; buf->data = kmalloc(TTY_AUDIT_BUF_SIZE, GFP_KERNEL); if (!buf->data) goto err_buf; mutex_init(&buf->mutex); return buf; err_buf: kfree(buf); err: return NULL; } static void tty_audit_buf_free(struct tty_audit_buf *buf) { WARN_ON(buf->valid != 0); kfree(buf->data); kfree(buf); } static void tty_audit_log(const char *description, dev_t dev, const u8 *data, size_t size) { struct audit_buffer *ab; pid_t pid = task_pid_nr(current); uid_t uid = from_kuid(&init_user_ns, task_uid(current)); uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current)); unsigned int sessionid = audit_get_sessionid(current); char name[TASK_COMM_LEN]; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_TTY); if (!ab) return; audit_log_format(ab, "%s pid=%u uid=%u auid=%u ses=%u major=%d minor=%d comm=", description, pid, uid, loginuid, sessionid, MAJOR(dev), MINOR(dev)); get_task_comm(name, current); audit_log_untrustedstring(ab, name); audit_log_format(ab, " data="); audit_log_n_hex(ab, data, size); audit_log_end(ab); } /* * tty_audit_buf_push - Push buffered data out * * Generate an audit message from the contents of @buf, which is owned by * the current task. @buf->mutex must be locked. */ static void tty_audit_buf_push(struct tty_audit_buf *buf) { if (buf->valid == 0) return; if (audit_enabled == AUDIT_OFF) { buf->valid = 0; return; } tty_audit_log("tty", buf->dev, buf->data, buf->valid); buf->valid = 0; } /** * tty_audit_exit - Handle a task exit * * Make sure all buffered data is written out and deallocate the buffer. * Only needs to be called if current->signal->tty_audit_buf != %NULL. * * The process is single-threaded at this point; no other threads share * current->signal. */ void tty_audit_exit(void) { struct tty_audit_buf *buf; buf = xchg(&current->signal->tty_audit_buf, ERR_PTR(-ESRCH)); if (!buf) return; tty_audit_buf_push(buf); tty_audit_buf_free(buf); } /* * tty_audit_fork - Copy TTY audit state for a new task * * Set up TTY audit state in @sig from current. @sig needs no locking. */ void tty_audit_fork(struct signal_struct *sig) { sig->audit_tty = current->signal->audit_tty; } /* * tty_audit_tiocsti - Log TIOCSTI */ void tty_audit_tiocsti(const struct tty_struct *tty, u8 ch) { dev_t dev; dev = MKDEV(tty->driver->major, tty->driver->minor_start) + tty->index; if (tty_audit_push()) return; if (audit_enabled) tty_audit_log("ioctl=TIOCSTI", dev, &ch, 1); } /* * tty_audit_push - Flush current's pending audit data * * Returns 0 if success, -EPERM if tty audit is disabled */ int tty_audit_push(void) { struct tty_audit_buf *buf; if (~current->signal->audit_tty & AUDIT_TTY_ENABLE) return -EPERM; buf = tty_audit_buf_ref(); if (!IS_ERR_OR_NULL(buf)) { mutex_lock(&buf->mutex); tty_audit_buf_push(buf); mutex_unlock(&buf->mutex); } return 0; } /* * tty_audit_buf_get - Get an audit buffer. * * Get an audit buffer, allocate it if necessary. Return %NULL * if out of memory or ERR_PTR(-ESRCH) if tty_audit_exit() has already * occurred. Otherwise, return a new reference to the buffer. */ static struct tty_audit_buf *tty_audit_buf_get(void) { struct tty_audit_buf *buf; buf = tty_audit_buf_ref(); if (buf) return buf; buf = tty_audit_buf_alloc(); if (buf == NULL) { audit_log_lost("out of memory in TTY auditing"); return NULL; } /* Race to use this buffer, free it if another wins */ if (cmpxchg(&current->signal->tty_audit_buf, NULL, buf) != NULL) tty_audit_buf_free(buf); return tty_audit_buf_ref(); } /* * tty_audit_add_data - Add data for TTY auditing. * * Audit @data of @size from @tty, if necessary. */ void tty_audit_add_data(const struct tty_struct *tty, const void *data, size_t size) { struct tty_audit_buf *buf; unsigned int audit_tty; bool icanon = L_ICANON(tty); dev_t dev; audit_tty = READ_ONCE(current->signal->audit_tty); if (~audit_tty & AUDIT_TTY_ENABLE) return; if (unlikely(size == 0)) return; if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER) return; if ((~audit_tty & AUDIT_TTY_LOG_PASSWD) && icanon && !L_ECHO(tty)) return; buf = tty_audit_buf_get(); if (IS_ERR_OR_NULL(buf)) return; mutex_lock(&buf->mutex); dev = MKDEV(tty->driver->major, tty->driver->minor_start) + tty->index; if (buf->dev != dev || buf->icanon != icanon) { tty_audit_buf_push(buf); buf->dev = dev; buf->icanon = icanon; } do { size_t run; run = TTY_AUDIT_BUF_SIZE - buf->valid; if (run > size) run = size; memcpy(buf->data + buf->valid, data, run); buf->valid += run; data += run; size -= run; if (buf->valid == TTY_AUDIT_BUF_SIZE) tty_audit_buf_push(buf); } while (size != 0); mutex_unlock(&buf->mutex); }
1059 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_DST_OPS_H #define _NET_DST_OPS_H #include <linux/types.h> #include <linux/percpu_counter.h> #include <linux/cache.h> struct dst_entry; struct kmem_cachep; struct net_device; struct sk_buff; struct sock; struct net; struct dst_ops { unsigned short family; unsigned int gc_thresh; void (*gc)(struct dst_ops *ops); struct dst_entry * (*check)(struct dst_entry *, __u32 cookie); unsigned int (*default_advmss)(const struct dst_entry *); unsigned int (*mtu)(const struct dst_entry *); u32 * (*cow_metrics)(struct dst_entry *, unsigned long); void (*destroy)(struct dst_entry *); void (*ifdown)(struct dst_entry *, struct net_device *dev); void (*negative_advice)(struct sock *sk, struct dst_entry *); void (*link_failure)(struct sk_buff *); void (*update_pmtu)(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh); void (*redirect)(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); int (*local_out)(struct net *net, struct sock *sk, struct sk_buff *skb); struct neighbour * (*neigh_lookup)(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr); void (*confirm_neigh)(const struct dst_entry *dst, const void *daddr); struct kmem_cache *kmem_cachep; struct percpu_counter pcpuc_entries ____cacheline_aligned_in_smp; }; static inline int dst_entries_get_fast(struct dst_ops *dst) { return percpu_counter_read_positive(&dst->pcpuc_entries); } static inline int dst_entries_get_slow(struct dst_ops *dst) { return percpu_counter_sum_positive(&dst->pcpuc_entries); } #define DST_PERCPU_COUNTER_BATCH 32 static inline void dst_entries_add(struct dst_ops *dst, int val) { percpu_counter_add_batch(&dst->pcpuc_entries, val, DST_PERCPU_COUNTER_BATCH); } static inline int dst_entries_init(struct dst_ops *dst) { return percpu_counter_init(&dst->pcpuc_entries, 0, GFP_KERNEL); } static inline void dst_entries_destroy(struct dst_ops *dst) { percpu_counter_destroy(&dst->pcpuc_entries); } #endif
91 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * x86-optimized SHA-512 block function * * Copyright 2025 Google LLC */ #include <asm/fpu/api.h> #include <linux/static_call.h> DEFINE_STATIC_CALL(sha512_blocks_x86, sha512_blocks_generic); #define DEFINE_X86_SHA512_FN(c_fn, asm_fn) \ asmlinkage void asm_fn(struct sha512_block_state *state, \ const u8 *data, size_t nblocks); \ static void c_fn(struct sha512_block_state *state, const u8 *data, \ size_t nblocks) \ { \ if (likely(irq_fpu_usable())) { \ kernel_fpu_begin(); \ asm_fn(state, data, nblocks); \ kernel_fpu_end(); \ } else { \ sha512_blocks_generic(state, data, nblocks); \ } \ } DEFINE_X86_SHA512_FN(sha512_blocks_ssse3, sha512_transform_ssse3); DEFINE_X86_SHA512_FN(sha512_blocks_avx, sha512_transform_avx); DEFINE_X86_SHA512_FN(sha512_blocks_avx2, sha512_transform_rorx); static void sha512_blocks(struct sha512_block_state *state, const u8 *data, size_t nblocks) { static_call(sha512_blocks_x86)(state, data, nblocks); } #define sha512_mod_init_arch sha512_mod_init_arch static void sha512_mod_init_arch(void) { if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) && boot_cpu_has(X86_FEATURE_AVX)) { if (boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI2)) static_call_update(sha512_blocks_x86, sha512_blocks_avx2); else static_call_update(sha512_blocks_x86, sha512_blocks_avx); } else if (boot_cpu_has(X86_FEATURE_SSSE3)) { static_call_update(sha512_blocks_x86, sha512_blocks_ssse3); } }
20 19 16 20 20 20 20 16 20 4 20 20 20 20 20 4 20 20 20 20 19 20 20 20 20 20 20 20 20 20 4 20 20 16 16 20 20 20 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 /* * Non-physical true random number generator based on timing jitter -- * Jitter RNG standalone code. * * Copyright Stephan Mueller <smueller@chronox.de>, 2015 - 2023 * * Design * ====== * * See https://www.chronox.de/jent.html * * License * ======= * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, and the entire permission notice in its entirety, * including the disclaimer of warranties. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * ALTERNATIVELY, this product may be distributed under the terms of * the GNU General Public License, in which case the provisions of the GPL2 are * required INSTEAD OF the above restrictions. (This clause is * necessary due to a potential bad interaction between the GPL and * the restrictions contained in a BSD-style copyright.) * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ /* * This Jitterentropy RNG is based on the jitterentropy library * version 3.4.0 provided at https://www.chronox.de/jent.html */ #ifdef __OPTIMIZE__ #error "The CPU Jitter random number generator must not be compiled with optimizations. See documentation. Use the compiler switch -O0 for compiling jitterentropy.c." #endif typedef unsigned long long __u64; typedef long long __s64; typedef unsigned int __u32; typedef unsigned char u8; #define NULL ((void *) 0) /* The entropy pool */ struct rand_data { /* SHA3-256 is used as conditioner */ #define DATA_SIZE_BITS 256 /* all data values that are vital to maintain the security * of the RNG are marked as SENSITIVE. A user must not * access that information while the RNG executes its loops to * calculate the next random value. */ void *hash_state; /* SENSITIVE hash state entropy pool */ __u64 prev_time; /* SENSITIVE Previous time stamp */ __u64 last_delta; /* SENSITIVE stuck test */ __s64 last_delta2; /* SENSITIVE stuck test */ unsigned int flags; /* Flags used to initialize */ unsigned int osr; /* Oversample rate */ #define JENT_MEMORY_ACCESSLOOPS 128 #define JENT_MEMORY_SIZE \ (CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKS * \ CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKSIZE) unsigned char *mem; /* Memory access location with size of * memblocks * memblocksize */ unsigned int memlocation; /* Pointer to byte in *mem */ unsigned int memblocks; /* Number of memory blocks in *mem */ unsigned int memblocksize; /* Size of one memory block in bytes */ unsigned int memaccessloops; /* Number of memory accesses per random * bit generation */ /* Repetition Count Test */ unsigned int rct_count; /* Number of stuck values */ /* Adaptive Proportion Test cutoff values */ unsigned int apt_cutoff; /* Intermittent health test failure */ unsigned int apt_cutoff_permanent; /* Permanent health test failure */ #define JENT_APT_WINDOW_SIZE 512 /* Data window size */ /* LSB of time stamp to process */ #define JENT_APT_LSB 16 #define JENT_APT_WORD_MASK (JENT_APT_LSB - 1) unsigned int apt_observations; /* Number of collected observations */ unsigned int apt_count; /* APT counter */ unsigned int apt_base; /* APT base reference */ unsigned int health_failure; /* Record health failure */ unsigned int apt_base_set:1; /* APT base reference set? */ }; /* Flags that can be used to initialize the RNG */ #define JENT_DISABLE_MEMORY_ACCESS (1<<2) /* Disable memory access for more * entropy, saves MEMORY_SIZE RAM for * entropy collector */ /* -- error codes for init function -- */ #define JENT_ENOTIME 1 /* Timer service not available */ #define JENT_ECOARSETIME 2 /* Timer too coarse for RNG */ #define JENT_ENOMONOTONIC 3 /* Timer is not monotonic increasing */ #define JENT_EVARVAR 5 /* Timer does not produce variations of * variations (2nd derivation of time is * zero). */ #define JENT_ESTUCK 8 /* Too many stuck results during init. */ #define JENT_EHEALTH 9 /* Health test failed during initialization */ #define JENT_ERCT 10 /* RCT failed during initialization */ #define JENT_EHASH 11 /* Hash self test failed */ #define JENT_EMEM 12 /* Can't allocate memory for initialization */ #define JENT_RCT_FAILURE 1 /* Failure in RCT health test. */ #define JENT_APT_FAILURE 2 /* Failure in APT health test. */ #define JENT_PERMANENT_FAILURE_SHIFT 16 #define JENT_PERMANENT_FAILURE(x) (x << JENT_PERMANENT_FAILURE_SHIFT) #define JENT_RCT_FAILURE_PERMANENT JENT_PERMANENT_FAILURE(JENT_RCT_FAILURE) #define JENT_APT_FAILURE_PERMANENT JENT_PERMANENT_FAILURE(JENT_APT_FAILURE) /* * The output n bits can receive more than n bits of min entropy, of course, * but the fixed output of the conditioning function can only asymptotically * approach the output size bits of min entropy, not attain that bound. Random * maps will tend to have output collisions, which reduces the creditable * output entropy (that is what SP 800-90B Section 3.1.5.1.2 attempts to bound). * * The value "64" is justified in Appendix A.4 of the current 90C draft, * and aligns with NIST's in "epsilon" definition in this document, which is * that a string can be considered "full entropy" if you can bound the min * entropy in each bit of output to at least 1-epsilon, where epsilon is * required to be <= 2^(-32). */ #define JENT_ENTROPY_SAFETY_FACTOR 64 #include <linux/array_size.h> #include <linux/fips.h> #include <linux/minmax.h> #include "jitterentropy.h" /*************************************************************************** * Adaptive Proportion Test * * This test complies with SP800-90B section 4.4.2. ***************************************************************************/ /* * See the SP 800-90B comment #10b for the corrected cutoff for the SP 800-90B * APT. * https://www.untruth.org/~josh/sp80090b/UL%20SP800-90B-final%20comments%20v1.9%2020191212.pdf * In the syntax of R, this is C = 2 + qbinom(1 − 2^(−30), 511, 2^(-1/osr)). * (The original formula wasn't correct because the first symbol must * necessarily have been observed, so there is no chance of observing 0 of these * symbols.) * * For the alpha < 2^-53, R cannot be used as it uses a float data type without * arbitrary precision. A SageMath script is used to calculate those cutoff * values. * * For any value above 14, this yields the maximal allowable value of 512 * (by FIPS 140-2 IG 7.19 Resolution # 16, we cannot choose a cutoff value that * renders the test unable to fail). */ static const unsigned int jent_apt_cutoff_lookup[15] = { 325, 422, 459, 477, 488, 494, 499, 502, 505, 507, 508, 509, 510, 511, 512 }; static const unsigned int jent_apt_cutoff_permanent_lookup[15] = { 355, 447, 479, 494, 502, 507, 510, 512, 512, 512, 512, 512, 512, 512, 512 }; static void jent_apt_init(struct rand_data *ec, unsigned int osr) { /* * Establish the apt_cutoff based on the presumed entropy rate of * 1/osr. */ if (osr >= ARRAY_SIZE(jent_apt_cutoff_lookup)) { ec->apt_cutoff = jent_apt_cutoff_lookup[ ARRAY_SIZE(jent_apt_cutoff_lookup) - 1]; ec->apt_cutoff_permanent = jent_apt_cutoff_permanent_lookup[ ARRAY_SIZE(jent_apt_cutoff_permanent_lookup) - 1]; } else { ec->apt_cutoff = jent_apt_cutoff_lookup[osr - 1]; ec->apt_cutoff_permanent = jent_apt_cutoff_permanent_lookup[osr - 1]; } } /* * Reset the APT counter * * @ec [in] Reference to entropy collector */ static void jent_apt_reset(struct rand_data *ec, unsigned int delta_masked) { /* Reset APT counter */ ec->apt_count = 0; ec->apt_base = delta_masked; ec->apt_observations = 0; } /* * Insert a new entropy event into APT * * @ec [in] Reference to entropy collector * @delta_masked [in] Masked time delta to process */ static void jent_apt_insert(struct rand_data *ec, unsigned int delta_masked) { /* Initialize the base reference */ if (!ec->apt_base_set) { ec->apt_base = delta_masked; ec->apt_base_set = 1; return; } if (delta_masked == ec->apt_base) { ec->apt_count++; /* Note, ec->apt_count starts with one. */ if (ec->apt_count >= ec->apt_cutoff_permanent) ec->health_failure |= JENT_APT_FAILURE_PERMANENT; else if (ec->apt_count >= ec->apt_cutoff) ec->health_failure |= JENT_APT_FAILURE; } ec->apt_observations++; if (ec->apt_observations >= JENT_APT_WINDOW_SIZE) jent_apt_reset(ec, delta_masked); } /*************************************************************************** * Stuck Test and its use as Repetition Count Test * * The Jitter RNG uses an enhanced version of the Repetition Count Test * (RCT) specified in SP800-90B section 4.4.1. Instead of counting identical * back-to-back values, the input to the RCT is the counting of the stuck * values during the generation of one Jitter RNG output block. * * The RCT is applied with an alpha of 2^{-30} compliant to FIPS 140-2 IG 9.8. * * During the counting operation, the Jitter RNG always calculates the RCT * cut-off value of C. If that value exceeds the allowed cut-off value, * the Jitter RNG output block will be calculated completely but discarded at * the end. The caller of the Jitter RNG is informed with an error code. ***************************************************************************/ /* * Repetition Count Test as defined in SP800-90B section 4.4.1 * * @ec [in] Reference to entropy collector * @stuck [in] Indicator whether the value is stuck */ static void jent_rct_insert(struct rand_data *ec, int stuck) { if (stuck) { ec->rct_count++; /* * The cutoff value is based on the following consideration: * alpha = 2^-30 or 2^-60 as recommended in SP800-90B. * In addition, we require an entropy value H of 1/osr as this * is the minimum entropy required to provide full entropy. * Note, we collect (DATA_SIZE_BITS + ENTROPY_SAFETY_FACTOR)*osr * deltas for inserting them into the entropy pool which should * then have (close to) DATA_SIZE_BITS bits of entropy in the * conditioned output. * * Note, ec->rct_count (which equals to value B in the pseudo * code of SP800-90B section 4.4.1) starts with zero. Hence * we need to subtract one from the cutoff value as calculated * following SP800-90B. Thus C = ceil(-log_2(alpha)/H) = 30*osr * or 60*osr. */ if ((unsigned int)ec->rct_count >= (60 * ec->osr)) { ec->rct_count = -1; ec->health_failure |= JENT_RCT_FAILURE_PERMANENT; } else if ((unsigned int)ec->rct_count >= (30 * ec->osr)) { ec->rct_count = -1; ec->health_failure |= JENT_RCT_FAILURE; } } else { /* Reset RCT */ ec->rct_count = 0; } } static inline __u64 jent_delta(__u64 prev, __u64 next) { #define JENT_UINT64_MAX (__u64)(~((__u64) 0)) return (prev < next) ? (next - prev) : (JENT_UINT64_MAX - prev + 1 + next); } /* * Stuck test by checking the: * 1st derivative of the jitter measurement (time delta) * 2nd derivative of the jitter measurement (delta of time deltas) * 3rd derivative of the jitter measurement (delta of delta of time deltas) * * All values must always be non-zero. * * @ec [in] Reference to entropy collector * @current_delta [in] Jitter time delta * * @return * 0 jitter measurement not stuck (good bit) * 1 jitter measurement stuck (reject bit) */ static int jent_stuck(struct rand_data *ec, __u64 current_delta) { __u64 delta2 = jent_delta(ec->last_delta, current_delta); __u64 delta3 = jent_delta(ec->last_delta2, delta2); ec->last_delta = current_delta; ec->last_delta2 = delta2; /* * Insert the result of the comparison of two back-to-back time * deltas. */ jent_apt_insert(ec, current_delta); if (!current_delta || !delta2 || !delta3) { /* RCT with a stuck bit */ jent_rct_insert(ec, 1); return 1; } /* RCT with a non-stuck bit */ jent_rct_insert(ec, 0); return 0; } /* * Report any health test failures * * @ec [in] Reference to entropy collector * * @return a bitmask indicating which tests failed * 0 No health test failure * 1 RCT failure * 2 APT failure * 1<<JENT_PERMANENT_FAILURE_SHIFT RCT permanent failure * 2<<JENT_PERMANENT_FAILURE_SHIFT APT permanent failure */ static unsigned int jent_health_failure(struct rand_data *ec) { /* Test is only enabled in FIPS mode */ if (!fips_enabled) return 0; return ec->health_failure; } /*************************************************************************** * Noise sources ***************************************************************************/ /* * Update of the loop count used for the next round of * an entropy collection. * * Input: * @bits is the number of low bits of the timer to consider * @min is the number of bits we shift the timer value to the right at * the end to make sure we have a guaranteed minimum value * * @return Newly calculated loop counter */ static __u64 jent_loop_shuffle(unsigned int bits, unsigned int min) { __u64 time = 0; __u64 shuffle = 0; unsigned int i = 0; unsigned int mask = (1<<bits) - 1; jent_get_nstime(&time); /* * We fold the time value as much as possible to ensure that as many * bits of the time stamp are included as possible. */ for (i = 0; ((DATA_SIZE_BITS + bits - 1) / bits) > i; i++) { shuffle ^= time & mask; time = time >> bits; } /* * We add a lower boundary value to ensure we have a minimum * RNG loop count. */ return (shuffle + (1<<min)); } /* * CPU Jitter noise source -- this is the noise source based on the CPU * execution time jitter * * This function injects the individual bits of the time value into the * entropy pool using a hash. * * ec [in] entropy collector * time [in] time stamp to be injected * stuck [in] Is the time stamp identified as stuck? * * Output: * updated hash context in the entropy collector or error code */ static int jent_condition_data(struct rand_data *ec, __u64 time, int stuck) { #define SHA3_HASH_LOOP (1<<3) struct { int rct_count; unsigned int apt_observations; unsigned int apt_count; unsigned int apt_base; } addtl = { ec->rct_count, ec->apt_observations, ec->apt_count, ec->apt_base }; return jent_hash_time(ec->hash_state, time, (u8 *)&addtl, sizeof(addtl), SHA3_HASH_LOOP, stuck); } /* * Memory Access noise source -- this is a noise source based on variations in * memory access times * * This function performs memory accesses which will add to the timing * variations due to an unknown amount of CPU wait states that need to be * added when accessing memory. The memory size should be larger than the L1 * caches as outlined in the documentation and the associated testing. * * The L1 cache has a very high bandwidth, albeit its access rate is usually * slower than accessing CPU registers. Therefore, L1 accesses only add minimal * variations as the CPU has hardly to wait. Starting with L2, significant * variations are added because L2 typically does not belong to the CPU any more * and therefore a wider range of CPU wait states is necessary for accesses. * L3 and real memory accesses have even a wider range of wait states. However, * to reliably access either L3 or memory, the ec->mem memory must be quite * large which is usually not desirable. * * @ec [in] Reference to the entropy collector with the memory access data -- if * the reference to the memory block to be accessed is NULL, this noise * source is disabled * @loop_cnt [in] if a value not equal to 0 is set, use the given value * number of loops to perform the LFSR */ static void jent_memaccess(struct rand_data *ec, __u64 loop_cnt) { unsigned int wrap = 0; __u64 i = 0; #define MAX_ACC_LOOP_BIT 7 #define MIN_ACC_LOOP_BIT 0 __u64 acc_loop_cnt = jent_loop_shuffle(MAX_ACC_LOOP_BIT, MIN_ACC_LOOP_BIT); if (NULL == ec || NULL == ec->mem) return; wrap = ec->memblocksize * ec->memblocks; /* * testing purposes -- allow test app to set the counter, not * needed during runtime */ if (loop_cnt) acc_loop_cnt = loop_cnt; for (i = 0; i < (ec->memaccessloops + acc_loop_cnt); i++) { unsigned char *tmpval = ec->mem + ec->memlocation; /* * memory access: just add 1 to one byte, * wrap at 255 -- memory access implies read * from and write to memory location */ *tmpval = (*tmpval + 1) & 0xff; /* * Addition of memblocksize - 1 to pointer * with wrap around logic to ensure that every * memory location is hit evenly */ ec->memlocation = ec->memlocation + ec->memblocksize - 1; ec->memlocation = ec->memlocation % wrap; } } /*************************************************************************** * Start of entropy processing logic ***************************************************************************/ /* * This is the heart of the entropy generation: calculate time deltas and * use the CPU jitter in the time deltas. The jitter is injected into the * entropy pool. * * WARNING: ensure that ->prev_time is primed before using the output * of this function! This can be done by calling this function * and not using its result. * * @ec [in] Reference to entropy collector * * @return result of stuck test */ static int jent_measure_jitter(struct rand_data *ec, __u64 *ret_current_delta) { __u64 time = 0; __u64 current_delta = 0; int stuck; /* Invoke one noise source before time measurement to add variations */ jent_memaccess(ec, 0); /* * Get time stamp and calculate time delta to previous * invocation to measure the timing variations */ jent_get_nstime(&time); current_delta = jent_delta(ec->prev_time, time); ec->prev_time = time; /* Check whether we have a stuck measurement. */ stuck = jent_stuck(ec, current_delta); /* Now call the next noise sources which also injects the data */ if (jent_condition_data(ec, current_delta, stuck)) stuck = 1; /* return the raw entropy value */ if (ret_current_delta) *ret_current_delta = current_delta; return stuck; } /* * Generator of one 64 bit random number * Function fills rand_data->hash_state * * @ec [in] Reference to entropy collector */ static void jent_gen_entropy(struct rand_data *ec) { unsigned int k = 0, safety_factor = 0; if (fips_enabled) safety_factor = JENT_ENTROPY_SAFETY_FACTOR; /* priming of the ->prev_time value */ jent_measure_jitter(ec, NULL); while (!jent_health_failure(ec)) { /* If a stuck measurement is received, repeat measurement */ if (jent_measure_jitter(ec, NULL)) continue; /* * We multiply the loop value with ->osr to obtain the * oversampling rate requested by the caller */ if (++k >= ((DATA_SIZE_BITS + safety_factor) * ec->osr)) break; } } /* * Entry function: Obtain entropy for the caller. * * This function invokes the entropy gathering logic as often to generate * as many bytes as requested by the caller. The entropy gathering logic * creates 64 bit per invocation. * * This function truncates the last 64 bit entropy value output to the exact * size specified by the caller. * * @ec [in] Reference to entropy collector * @data [in] pointer to buffer for storing random data -- buffer must already * exist * @len [in] size of the buffer, specifying also the requested number of random * in bytes * * @return 0 when request is fulfilled or an error * * The following error codes can occur: * -1 entropy_collector is NULL or the generation failed * -2 Intermittent health failure * -3 Permanent health failure */ int jent_read_entropy(struct rand_data *ec, unsigned char *data, unsigned int len) { unsigned char *p = data; if (!ec) return -1; while (len > 0) { unsigned int tocopy, health_test_result; jent_gen_entropy(ec); health_test_result = jent_health_failure(ec); if (health_test_result > JENT_PERMANENT_FAILURE_SHIFT) { /* * At this point, the Jitter RNG instance is considered * as a failed instance. There is no rerun of the * startup test any more, because the caller * is assumed to not further use this instance. */ return -3; } else if (health_test_result) { /* * Perform startup health tests and return permanent * error if it fails. */ if (jent_entropy_init(0, 0, NULL, ec)) { /* Mark the permanent error */ ec->health_failure &= JENT_RCT_FAILURE_PERMANENT | JENT_APT_FAILURE_PERMANENT; return -3; } return -2; } tocopy = min(DATA_SIZE_BITS / 8, len); if (jent_read_random_block(ec->hash_state, p, tocopy)) return -1; len -= tocopy; p += tocopy; } return 0; } /*************************************************************************** * Initialization logic ***************************************************************************/ struct rand_data *jent_entropy_collector_alloc(unsigned int osr, unsigned int flags, void *hash_state) { struct rand_data *entropy_collector; entropy_collector = jent_zalloc(sizeof(struct rand_data)); if (!entropy_collector) return NULL; if (!(flags & JENT_DISABLE_MEMORY_ACCESS)) { /* Allocate memory for adding variations based on memory * access */ entropy_collector->mem = jent_kvzalloc(JENT_MEMORY_SIZE); if (!entropy_collector->mem) { jent_zfree(entropy_collector); return NULL; } entropy_collector->memblocksize = CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKSIZE; entropy_collector->memblocks = CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKS; entropy_collector->memaccessloops = JENT_MEMORY_ACCESSLOOPS; } /* verify and set the oversampling rate */ if (osr == 0) osr = 1; /* H_submitter = 1 / osr */ entropy_collector->osr = osr; entropy_collector->flags = flags; entropy_collector->hash_state = hash_state; /* Initialize the APT */ jent_apt_init(entropy_collector, osr); /* fill the data pad with non-zero values */ jent_gen_entropy(entropy_collector); return entropy_collector; } void jent_entropy_collector_free(struct rand_data *entropy_collector) { jent_kvzfree(entropy_collector->mem, JENT_MEMORY_SIZE); entropy_collector->mem = NULL; jent_zfree(entropy_collector); } int jent_entropy_init(unsigned int osr, unsigned int flags, void *hash_state, struct rand_data *p_ec) { /* * If caller provides an allocated ec, reuse it which implies that the * health test entropy data is used to further still the available * entropy pool. */ struct rand_data *ec = p_ec; int i, time_backwards = 0, ret = 0, ec_free = 0; unsigned int health_test_result; if (!ec) { ec = jent_entropy_collector_alloc(osr, flags, hash_state); if (!ec) return JENT_EMEM; ec_free = 1; } else { /* Reset the APT */ jent_apt_reset(ec, 0); /* Ensure that a new APT base is obtained */ ec->apt_base_set = 0; /* Reset the RCT */ ec->rct_count = 0; /* Reset intermittent, leave permanent health test result */ ec->health_failure &= (~JENT_RCT_FAILURE); ec->health_failure &= (~JENT_APT_FAILURE); } /* We could perform statistical tests here, but the problem is * that we only have a few loop counts to do testing. These * loop counts may show some slight skew and we produce * false positives. * * Moreover, only old systems show potentially problematic * jitter entropy that could potentially be caught here. But * the RNG is intended for hardware that is available or widely * used, but not old systems that are long out of favor. Thus, * no statistical tests. */ /* * We could add a check for system capabilities such as clock_getres or * check for CONFIG_X86_TSC, but it does not make much sense as the * following sanity checks verify that we have a high-resolution * timer. */ /* * TESTLOOPCOUNT needs some loops to identify edge systems. 100 is * definitely too little. * * SP800-90B requires at least 1024 initial test cycles. */ #define TESTLOOPCOUNT 1024 #define CLEARCACHE 100 for (i = 0; (TESTLOOPCOUNT + CLEARCACHE) > i; i++) { __u64 start_time = 0, end_time = 0, delta = 0; /* Invoke core entropy collection logic */ jent_measure_jitter(ec, &delta); end_time = ec->prev_time; start_time = ec->prev_time - delta; /* test whether timer works */ if (!start_time || !end_time) { ret = JENT_ENOTIME; goto out; } /* * test whether timer is fine grained enough to provide * delta even when called shortly after each other -- this * implies that we also have a high resolution timer */ if (!delta || (end_time == start_time)) { ret = JENT_ECOARSETIME; goto out; } /* * up to here we did not modify any variable that will be * evaluated later, but we already performed some work. Thus we * already have had an impact on the caches, branch prediction, * etc. with the goal to clear it to get the worst case * measurements. */ if (i < CLEARCACHE) continue; /* test whether we have an increasing timer */ if (!(end_time > start_time)) time_backwards++; } /* * we allow up to three times the time running backwards. * CLOCK_REALTIME is affected by adjtime and NTP operations. Thus, * if such an operation just happens to interfere with our test, it * should not fail. The value of 3 should cover the NTP case being * performed during our test run. */ if (time_backwards > 3) { ret = JENT_ENOMONOTONIC; goto out; } /* Did we encounter a health test failure? */ health_test_result = jent_health_failure(ec); if (health_test_result) { ret = (health_test_result & JENT_RCT_FAILURE) ? JENT_ERCT : JENT_EHEALTH; goto out; } out: if (ec_free) jent_entropy_collector_free(ec); return ret; }
15 9 5 1 5 9 8 1 1 3 2 2 1 1 1 1 13 13 9 6 1 21 21 15 1 15 18 4 1 25 2 23 23 27 5 21 3 3 4 49 49 49 49 49 49 57 57 56 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 // SPDX-License-Identifier: GPL-2.0-only /* * Minimal file system backend for holding eBPF maps and programs, * used by bpf(2) object pinning. * * Authors: * * Daniel Borkmann <daniel@iogearbox.net> */ #include <linux/init.h> #include <linux/magic.h> #include <linux/major.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/fs.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/kdev_t.h> #include <linux/filter.h> #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <linux/kstrtox.h> #include "preload/bpf_preload.h" enum bpf_type { BPF_TYPE_UNSPEC = 0, BPF_TYPE_PROG, BPF_TYPE_MAP, BPF_TYPE_LINK, }; static void *bpf_any_get(void *raw, enum bpf_type type) { switch (type) { case BPF_TYPE_PROG: bpf_prog_inc(raw); break; case BPF_TYPE_MAP: bpf_map_inc_with_uref(raw); break; case BPF_TYPE_LINK: bpf_link_inc(raw); break; default: WARN_ON_ONCE(1); break; } return raw; } static void bpf_any_put(void *raw, enum bpf_type type) { switch (type) { case BPF_TYPE_PROG: bpf_prog_put(raw); break; case BPF_TYPE_MAP: bpf_map_put_with_uref(raw); break; case BPF_TYPE_LINK: bpf_link_put(raw); break; default: WARN_ON_ONCE(1); break; } } static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) { void *raw; raw = bpf_map_get_with_uref(ufd); if (!IS_ERR(raw)) { *type = BPF_TYPE_MAP; return raw; } raw = bpf_prog_get(ufd); if (!IS_ERR(raw)) { *type = BPF_TYPE_PROG; return raw; } raw = bpf_link_get_from_fd(ufd); if (!IS_ERR(raw)) { *type = BPF_TYPE_LINK; return raw; } return ERR_PTR(-EINVAL); } static const struct inode_operations bpf_dir_iops; static const struct inode_operations bpf_prog_iops = { }; static const struct inode_operations bpf_map_iops = { }; static const struct inode_operations bpf_link_iops = { }; struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir, umode_t mode) { struct inode *inode; switch (mode & S_IFMT) { case S_IFDIR: case S_IFREG: case S_IFLNK: break; default: return ERR_PTR(-EINVAL); } inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOSPC); inode->i_ino = get_next_ino(); simple_inode_init_ts(inode); inode_init_owner(&nop_mnt_idmap, inode, dir, mode); return inode; } static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) { *type = BPF_TYPE_UNSPEC; if (inode->i_op == &bpf_prog_iops) *type = BPF_TYPE_PROG; else if (inode->i_op == &bpf_map_iops) *type = BPF_TYPE_MAP; else if (inode->i_op == &bpf_link_iops) *type = BPF_TYPE_LINK; else return -EACCES; return 0; } static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, struct inode *dir) { d_instantiate(dentry, inode); dget(dentry); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); } static struct dentry *bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR); if (IS_ERR(inode)) return ERR_CAST(inode); inode->i_op = &bpf_dir_iops; inode->i_fop = &simple_dir_operations; inc_nlink(inode); inc_nlink(dir); bpf_dentry_finalize(dentry, inode, dir); return NULL; } struct map_iter { void *key; bool done; }; static struct map_iter *map_iter(struct seq_file *m) { return m->private; } static struct bpf_map *seq_file_to_map(struct seq_file *m) { return file_inode(m->file)->i_private; } static void map_iter_free(struct map_iter *iter) { if (iter) { kfree(iter->key); kfree(iter); } } static struct map_iter *map_iter_alloc(struct bpf_map *map) { struct map_iter *iter; iter = kzalloc(sizeof(*iter), GFP_KERNEL | __GFP_NOWARN); if (!iter) goto error; iter->key = kzalloc(map->key_size, GFP_KERNEL | __GFP_NOWARN); if (!iter->key) goto error; return iter; error: map_iter_free(iter); return NULL; } static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) { struct bpf_map *map = seq_file_to_map(m); void *key = map_iter(m)->key; void *prev_key; (*pos)++; if (map_iter(m)->done) return NULL; if (unlikely(v == SEQ_START_TOKEN)) prev_key = NULL; else prev_key = key; rcu_read_lock(); if (map->ops->map_get_next_key(map, prev_key, key)) { map_iter(m)->done = true; key = NULL; } rcu_read_unlock(); return key; } static void *map_seq_start(struct seq_file *m, loff_t *pos) { if (map_iter(m)->done) return NULL; return *pos ? map_iter(m)->key : SEQ_START_TOKEN; } static void map_seq_stop(struct seq_file *m, void *v) { } static int map_seq_show(struct seq_file *m, void *v) { struct bpf_map *map = seq_file_to_map(m); void *key = map_iter(m)->key; if (unlikely(v == SEQ_START_TOKEN)) { seq_puts(m, "# WARNING!! The output is for debug purpose only\n"); seq_puts(m, "# WARNING!! The output format will change\n"); } else { map->ops->map_seq_show_elem(map, key, m); } return 0; } static const struct seq_operations bpffs_map_seq_ops = { .start = map_seq_start, .next = map_seq_next, .show = map_seq_show, .stop = map_seq_stop, }; static int bpffs_map_open(struct inode *inode, struct file *file) { struct bpf_map *map = inode->i_private; struct map_iter *iter; struct seq_file *m; int err; iter = map_iter_alloc(map); if (!iter) return -ENOMEM; err = seq_open(file, &bpffs_map_seq_ops); if (err) { map_iter_free(iter); return err; } m = file->private_data; m->private = iter; return 0; } static int bpffs_map_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; map_iter_free(map_iter(m)); return seq_release(inode, file); } /* bpffs_map_fops should only implement the basic * read operation for a BPF map. The purpose is to * provide a simple user intuitive way to do * "cat bpffs/pathto/a-pinned-map". * * Other operations (e.g. write, lookup...) should be realized by * the userspace tools (e.g. bpftool) through the * BPF_OBJ_GET_INFO_BY_FD and the map's lookup/update * interface. */ static const struct file_operations bpffs_map_fops = { .open = bpffs_map_open, .read = seq_read, .release = bpffs_map_release, }; static int bpffs_obj_open(struct inode *inode, struct file *file) { return -EIO; } static const struct file_operations bpffs_obj_fops = { .open = bpffs_obj_open, }; static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, const struct inode_operations *iops, const struct file_operations *fops) { struct inode *dir = dentry->d_parent->d_inode; struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); inode->i_op = iops; inode->i_fop = fops; inode->i_private = raw; bpf_dentry_finalize(dentry, inode, dir); return 0; } static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg) { return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops, &bpffs_obj_fops); } static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) { struct bpf_map *map = arg; return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops, bpf_map_support_seq_show(map) ? &bpffs_map_fops : &bpffs_obj_fops); } static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg) { struct bpf_link *link = arg; return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops, bpf_link_is_iter(link) ? &bpf_iter_fops : &bpffs_obj_fops); } static struct dentry * bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) { /* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future * extensions. That allows popoulate_bpffs() create special files. */ if ((dir->i_mode & S_IALLUGO) && strchr(dentry->d_name.name, '.')) return ERR_PTR(-EPERM); return simple_lookup(dir, dentry, flags); } static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *target) { char *link = kstrdup(target, GFP_USER | __GFP_NOWARN); struct inode *inode; if (!link) return -ENOMEM; inode = bpf_get_inode(dir->i_sb, dir, S_IRWXUGO | S_IFLNK); if (IS_ERR(inode)) { kfree(link); return PTR_ERR(inode); } inode->i_op = &simple_symlink_inode_operations; inode->i_link = link; bpf_dentry_finalize(dentry, inode, dir); return 0; } static const struct inode_operations bpf_dir_iops = { .lookup = bpf_lookup, .mkdir = bpf_mkdir, .symlink = bpf_symlink, .rmdir = simple_rmdir, .rename = simple_rename, .link = simple_link, .unlink = simple_unlink, }; /* pin iterator link into bpffs */ static int bpf_iter_link_pin_kernel(struct dentry *parent, const char *name, struct bpf_link *link) { umode_t mode = S_IFREG | S_IRUSR; struct dentry *dentry; int ret; inode_lock(parent->d_inode); dentry = lookup_noperm(&QSTR(name), parent); if (IS_ERR(dentry)) { inode_unlock(parent->d_inode); return PTR_ERR(dentry); } ret = bpf_mkobj_ops(dentry, mode, link, &bpf_link_iops, &bpf_iter_fops); dput(dentry); inode_unlock(parent->d_inode); return ret; } static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw, enum bpf_type type) { struct dentry *dentry; struct inode *dir; struct path path; umode_t mode; int ret; dentry = start_creating_user_path(path_fd, pathname, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); dir = d_inode(path.dentry); if (dir->i_op != &bpf_dir_iops) { ret = -EPERM; goto out; } mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); ret = security_path_mknod(&path, dentry, mode, 0); if (ret) goto out; switch (type) { case BPF_TYPE_PROG: ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw); break; case BPF_TYPE_MAP: ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw); break; case BPF_TYPE_LINK: ret = vfs_mkobj(dentry, mode, bpf_mklink, raw); break; default: ret = -EPERM; } out: end_creating_path(&path, dentry); return ret; } int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname) { enum bpf_type type; void *raw; int ret; raw = bpf_fd_probe_obj(ufd, &type); if (IS_ERR(raw)) return PTR_ERR(raw); ret = bpf_obj_do_pin(path_fd, pathname, raw, type); if (ret != 0) bpf_any_put(raw, type); return ret; } static void *bpf_obj_do_get(int path_fd, const char __user *pathname, enum bpf_type *type, int flags) { struct inode *inode; struct path path; void *raw; int ret; ret = user_path_at(path_fd, pathname, LOOKUP_FOLLOW, &path); if (ret) return ERR_PTR(ret); inode = d_backing_inode(path.dentry); ret = path_permission(&path, ACC_MODE(flags)); if (ret) goto out; ret = bpf_inode_type(inode, type); if (ret) goto out; raw = bpf_any_get(inode->i_private, *type); if (!IS_ERR(raw)) touch_atime(&path); path_put(&path); return raw; out: path_put(&path); return ERR_PTR(ret); } int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags) { enum bpf_type type = BPF_TYPE_UNSPEC; int f_flags; void *raw; int ret; f_flags = bpf_get_file_flag(flags); if (f_flags < 0) return f_flags; raw = bpf_obj_do_get(path_fd, pathname, &type, f_flags); if (IS_ERR(raw)) return PTR_ERR(raw); if (type == BPF_TYPE_PROG) ret = bpf_prog_new_fd(raw); else if (type == BPF_TYPE_MAP) ret = bpf_map_new_fd(raw, f_flags); else if (type == BPF_TYPE_LINK) ret = (f_flags != O_RDWR) ? -EINVAL : bpf_link_new_fd(raw); else return -ENOENT; if (ret < 0) bpf_any_put(raw, type); return ret; } static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) { struct bpf_prog *prog; int ret = inode_permission(&nop_mnt_idmap, inode, MAY_READ); if (ret) return ERR_PTR(ret); if (inode->i_op == &bpf_map_iops) return ERR_PTR(-EINVAL); if (inode->i_op == &bpf_link_iops) return ERR_PTR(-EINVAL); if (inode->i_op != &bpf_prog_iops) return ERR_PTR(-EACCES); prog = inode->i_private; ret = security_bpf_prog(prog); if (ret < 0) return ERR_PTR(ret); if (!bpf_prog_get_ok(prog, &type, false)) return ERR_PTR(-EINVAL); bpf_prog_inc(prog); return prog; } struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) { struct bpf_prog *prog; struct path path; int ret = kern_path(name, LOOKUP_FOLLOW, &path); if (ret) return ERR_PTR(ret); prog = __get_prog_inode(d_backing_inode(path.dentry), type); if (!IS_ERR(prog)) touch_atime(&path); path_put(&path); return prog; } EXPORT_SYMBOL(bpf_prog_get_type_path); struct bpffs_btf_enums { const struct btf *btf; const struct btf_type *cmd_t; const struct btf_type *map_t; const struct btf_type *prog_t; const struct btf_type *attach_t; }; static int find_bpffs_btf_enums(struct bpffs_btf_enums *info) { const struct btf *btf; const struct btf_type *t; const char *name; int i, n; memset(info, 0, sizeof(*info)); btf = bpf_get_btf_vmlinux(); if (IS_ERR(btf)) return PTR_ERR(btf); if (!btf) return -ENOENT; info->btf = btf; for (i = 1, n = btf_nr_types(btf); i < n; i++) { t = btf_type_by_id(btf, i); if (!btf_type_is_enum(t)) continue; name = btf_name_by_offset(btf, t->name_off); if (!name) continue; if (strcmp(name, "bpf_cmd") == 0) info->cmd_t = t; else if (strcmp(name, "bpf_map_type") == 0) info->map_t = t; else if (strcmp(name, "bpf_prog_type") == 0) info->prog_t = t; else if (strcmp(name, "bpf_attach_type") == 0) info->attach_t = t; else continue; if (info->cmd_t && info->map_t && info->prog_t && info->attach_t) return 0; } return -ESRCH; } static bool find_btf_enum_const(const struct btf *btf, const struct btf_type *enum_t, const char *prefix, const char *str, int *value) { const struct btf_enum *e; const char *name; int i, n, pfx_len = strlen(prefix); *value = 0; if (!btf || !enum_t) return false; for (i = 0, n = btf_vlen(enum_t); i < n; i++) { e = &btf_enum(enum_t)[i]; name = btf_name_by_offset(btf, e->name_off); if (!name || strncasecmp(name, prefix, pfx_len) != 0) continue; /* match symbolic name case insensitive and ignoring prefix */ if (strcasecmp(name + pfx_len, str) == 0) { *value = e->val; return true; } } return false; } static void seq_print_delegate_opts(struct seq_file *m, const char *opt_name, const struct btf *btf, const struct btf_type *enum_t, const char *prefix, u64 delegate_msk, u64 any_msk) { const struct btf_enum *e; bool first = true; const char *name; u64 msk; int i, n, pfx_len = strlen(prefix); delegate_msk &= any_msk; /* clear unknown bits */ if (delegate_msk == 0) return; seq_printf(m, ",%s", opt_name); if (delegate_msk == any_msk) { seq_printf(m, "=any"); return; } if (btf && enum_t) { for (i = 0, n = btf_vlen(enum_t); i < n; i++) { e = &btf_enum(enum_t)[i]; name = btf_name_by_offset(btf, e->name_off); if (!name || strncasecmp(name, prefix, pfx_len) != 0) continue; msk = 1ULL << e->val; if (delegate_msk & msk) { /* emit lower-case name without prefix */ seq_putc(m, first ? '=' : ':'); name += pfx_len; while (*name) { seq_putc(m, tolower(*name)); name++; } delegate_msk &= ~msk; first = false; } } } if (delegate_msk) seq_printf(m, "%c0x%llx", first ? '=' : ':', delegate_msk); } /* * Display the mount options in /proc/mounts. */ static int bpf_show_options(struct seq_file *m, struct dentry *root) { struct inode *inode = d_inode(root); umode_t mode = inode->i_mode & S_IALLUGO & ~S_ISVTX; struct bpf_mount_opts *opts = root->d_sb->s_fs_info; u64 mask; if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID)) seq_printf(m, ",uid=%u", from_kuid_munged(&init_user_ns, inode->i_uid)); if (!gid_eq(inode->i_gid, GLOBAL_ROOT_GID)) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, inode->i_gid)); if (mode != S_IRWXUGO) seq_printf(m, ",mode=%o", mode); if (opts->delegate_cmds || opts->delegate_maps || opts->delegate_progs || opts->delegate_attachs) { struct bpffs_btf_enums info; /* ignore errors, fallback to hex */ (void)find_bpffs_btf_enums(&info); mask = (1ULL << __MAX_BPF_CMD) - 1; seq_print_delegate_opts(m, "delegate_cmds", info.btf, info.cmd_t, "BPF_", opts->delegate_cmds, mask); mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1; seq_print_delegate_opts(m, "delegate_maps", info.btf, info.map_t, "BPF_MAP_TYPE_", opts->delegate_maps, mask); mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1; seq_print_delegate_opts(m, "delegate_progs", info.btf, info.prog_t, "BPF_PROG_TYPE_", opts->delegate_progs, mask); mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1; seq_print_delegate_opts(m, "delegate_attachs", info.btf, info.attach_t, "BPF_", opts->delegate_attachs, mask); } return 0; } static void bpf_free_inode(struct inode *inode) { enum bpf_type type; if (S_ISLNK(inode->i_mode)) kfree(inode->i_link); if (!bpf_inode_type(inode, &type)) bpf_any_put(inode->i_private, type); free_inode_nonrcu(inode); } const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = inode_just_drop, .show_options = bpf_show_options, .free_inode = bpf_free_inode, }; enum { OPT_UID, OPT_GID, OPT_MODE, OPT_DELEGATE_CMDS, OPT_DELEGATE_MAPS, OPT_DELEGATE_PROGS, OPT_DELEGATE_ATTACHS, }; static const struct fs_parameter_spec bpf_fs_parameters[] = { fsparam_u32 ("uid", OPT_UID), fsparam_u32 ("gid", OPT_GID), fsparam_u32oct ("mode", OPT_MODE), fsparam_string ("delegate_cmds", OPT_DELEGATE_CMDS), fsparam_string ("delegate_maps", OPT_DELEGATE_MAPS), fsparam_string ("delegate_progs", OPT_DELEGATE_PROGS), fsparam_string ("delegate_attachs", OPT_DELEGATE_ATTACHS), {} }; static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct bpf_mount_opts *opts = fc->s_fs_info; struct fs_parse_result result; kuid_t uid; kgid_t gid; int opt, err; opt = fs_parse(fc, bpf_fs_parameters, param, &result); if (opt < 0) { /* We might like to report bad mount options here, but * traditionally we've ignored all mount options, so we'd * better continue to ignore non-existing options for bpf. */ if (opt == -ENOPARAM) { opt = vfs_parse_fs_param_source(fc, param); if (opt != -ENOPARAM) return opt; return 0; } if (opt < 0) return opt; } switch (opt) { case OPT_UID: uid = make_kuid(current_user_ns(), result.uint_32); if (!uid_valid(uid)) goto bad_value; /* * The requested uid must be representable in the * filesystem's idmapping. */ if (!kuid_has_mapping(fc->user_ns, uid)) goto bad_value; opts->uid = uid; break; case OPT_GID: gid = make_kgid(current_user_ns(), result.uint_32); if (!gid_valid(gid)) goto bad_value; /* * The requested gid must be representable in the * filesystem's idmapping. */ if (!kgid_has_mapping(fc->user_ns, gid)) goto bad_value; opts->gid = gid; break; case OPT_MODE: opts->mode = result.uint_32 & S_IALLUGO; break; case OPT_DELEGATE_CMDS: case OPT_DELEGATE_MAPS: case OPT_DELEGATE_PROGS: case OPT_DELEGATE_ATTACHS: { struct bpffs_btf_enums info; const struct btf_type *enum_t; const char *enum_pfx; u64 *delegate_msk, msk = 0; char *p, *str; int val; /* ignore errors, fallback to hex */ (void)find_bpffs_btf_enums(&info); switch (opt) { case OPT_DELEGATE_CMDS: delegate_msk = &opts->delegate_cmds; enum_t = info.cmd_t; enum_pfx = "BPF_"; break; case OPT_DELEGATE_MAPS: delegate_msk = &opts->delegate_maps; enum_t = info.map_t; enum_pfx = "BPF_MAP_TYPE_"; break; case OPT_DELEGATE_PROGS: delegate_msk = &opts->delegate_progs; enum_t = info.prog_t; enum_pfx = "BPF_PROG_TYPE_"; break; case OPT_DELEGATE_ATTACHS: delegate_msk = &opts->delegate_attachs; enum_t = info.attach_t; enum_pfx = "BPF_"; break; default: return -EINVAL; } str = param->string; while ((p = strsep(&str, ":"))) { if (strcmp(p, "any") == 0) { msk |= ~0ULL; } else if (find_btf_enum_const(info.btf, enum_t, enum_pfx, p, &val)) { msk |= 1ULL << val; } else { err = kstrtou64(p, 0, &msk); if (err) return err; } } /* Setting delegation mount options requires privileges */ if (msk && !capable(CAP_SYS_ADMIN)) return -EPERM; *delegate_msk |= msk; break; } default: /* ignore unknown mount options */ break; } return 0; bad_value: return invalfc(fc, "Bad value for '%s'", param->key); } struct bpf_preload_ops *bpf_preload_ops; EXPORT_SYMBOL_GPL(bpf_preload_ops); static bool bpf_preload_mod_get(void) { /* If bpf_preload.ko wasn't loaded earlier then load it now. * When bpf_preload is built into vmlinux the module's __init * function will populate it. */ if (!bpf_preload_ops) { request_module("bpf_preload"); if (!bpf_preload_ops) return false; } /* And grab the reference, so the module doesn't disappear while the * kernel is interacting with the kernel module and its UMD. */ if (!try_module_get(bpf_preload_ops->owner)) { pr_err("bpf_preload module get failed.\n"); return false; } return true; } static void bpf_preload_mod_put(void) { if (bpf_preload_ops) /* now user can "rmmod bpf_preload" if necessary */ module_put(bpf_preload_ops->owner); } static DEFINE_MUTEX(bpf_preload_lock); static int populate_bpffs(struct dentry *parent) { struct bpf_preload_info objs[BPF_PRELOAD_LINKS] = {}; int err = 0, i; /* grab the mutex to make sure the kernel interactions with bpf_preload * are serialized */ mutex_lock(&bpf_preload_lock); /* if bpf_preload.ko wasn't built into vmlinux then load it */ if (!bpf_preload_mod_get()) goto out; err = bpf_preload_ops->preload(objs); if (err) goto out_put; for (i = 0; i < BPF_PRELOAD_LINKS; i++) { bpf_link_inc(objs[i].link); err = bpf_iter_link_pin_kernel(parent, objs[i].link_name, objs[i].link); if (err) { bpf_link_put(objs[i].link); goto out_put; } } out_put: bpf_preload_mod_put(); out: mutex_unlock(&bpf_preload_lock); return err; } static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) { static const struct tree_descr bpf_rfiles[] = { { "" } }; struct bpf_mount_opts *opts = sb->s_fs_info; struct inode *inode; int ret; /* Mounting an instance of BPF FS requires privileges */ if (fc->user_ns != &init_user_ns && !capable(CAP_SYS_ADMIN)) return -EPERM; ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); if (ret) return ret; sb->s_op = &bpf_super_ops; inode = sb->s_root->d_inode; inode->i_uid = opts->uid; inode->i_gid = opts->gid; inode->i_op = &bpf_dir_iops; inode->i_mode &= ~S_IALLUGO; populate_bpffs(sb->s_root); inode->i_mode |= S_ISVTX | opts->mode; return 0; } static int bpf_get_tree(struct fs_context *fc) { return get_tree_nodev(fc, bpf_fill_super); } static void bpf_free_fc(struct fs_context *fc) { kfree(fc->s_fs_info); } static const struct fs_context_operations bpf_context_ops = { .free = bpf_free_fc, .parse_param = bpf_parse_param, .get_tree = bpf_get_tree, }; /* * Set up the filesystem mount context. */ static int bpf_init_fs_context(struct fs_context *fc) { struct bpf_mount_opts *opts; opts = kzalloc(sizeof(struct bpf_mount_opts), GFP_KERNEL); if (!opts) return -ENOMEM; opts->mode = S_IRWXUGO; opts->uid = current_fsuid(); opts->gid = current_fsgid(); /* start out with no BPF token delegation enabled */ opts->delegate_cmds = 0; opts->delegate_maps = 0; opts->delegate_progs = 0; opts->delegate_attachs = 0; fc->s_fs_info = opts; fc->ops = &bpf_context_ops; return 0; } static void bpf_kill_super(struct super_block *sb) { struct bpf_mount_opts *opts = sb->s_fs_info; kill_litter_super(sb); kfree(opts); } static struct file_system_type bpf_fs_type = { .owner = THIS_MODULE, .name = "bpf", .init_fs_context = bpf_init_fs_context, .parameters = bpf_fs_parameters, .kill_sb = bpf_kill_super, .fs_flags = FS_USERNS_MOUNT, }; static int __init bpf_init(void) { int ret; ret = sysfs_create_mount_point(fs_kobj, "bpf"); if (ret) return ret; ret = register_filesystem(&bpf_fs_type); if (ret) sysfs_remove_mount_point(fs_kobj, "bpf"); return ret; } fs_initcall(bpf_init);
58 17845 401 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM x86_fpu #if !defined(_TRACE_FPU_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_FPU_H #include <linux/tracepoint.h> DECLARE_EVENT_CLASS(x86_fpu, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu), TP_STRUCT__entry( __field(struct fpu *, fpu) __field(bool, load_fpu) __field(u64, xfeatures) __field(u64, xcomp_bv) ), TP_fast_assign( __entry->fpu = fpu; __entry->load_fpu = test_thread_flag(TIF_NEED_FPU_LOAD); if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { __entry->xfeatures = fpu->fpstate->regs.xsave.header.xfeatures; __entry->xcomp_bv = fpu->fpstate->regs.xsave.header.xcomp_bv; } ), TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx", __entry->fpu, __entry->load_fpu, __entry->xfeatures, __entry->xcomp_bv ) ); DEFINE_EVENT(x86_fpu, x86_fpu_before_save, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_after_save, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_regs_activated, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_regs_deactivated, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_dropped, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_copy_dst, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_xstate_check_failed, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH asm/trace/ #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE fpu #endif /* _TRACE_FPU_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
33 33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 // SPDX-License-Identifier: GPL-2.0 /* xfrm_hash.c: Common hash table code. * * Copyright (C) 2006 David S. Miller (davem@davemloft.net) */ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/memblock.h> #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/xfrm.h> #include "xfrm_hash.h" struct hlist_head *xfrm_hash_alloc(unsigned int sz) { struct hlist_head *n; if (sz <= PAGE_SIZE) n = kzalloc(sz, GFP_KERNEL); else if (hashdist) n = vzalloc(sz); else n = (struct hlist_head *) __get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, get_order(sz)); return n; } void xfrm_hash_free(struct hlist_head *n, unsigned int sz) { if (sz <= PAGE_SIZE) kfree(n); else if (hashdist) vfree(n); else free_pages((unsigned long)n, get_order(sz)); }
11 11 11 11 48 39 2 6 1 1 2 6 4 3 3 3 2 15 22 23 6 3 2 11 21 1 2 1 2 11 2 2 9 7 10 5 11 9 5 5 16 16 6 8 2 1 1 71 3 1 48 1 24 3 3 2 2 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_labels.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> struct nft_ct_helper_obj { struct nf_conntrack_helper *helper4; struct nf_conntrack_helper *helper6; u8 l4proto; }; #ifdef CONFIG_NF_CONNTRACK_ZONES static DEFINE_PER_CPU(struct nf_conn *, nft_ct_pcpu_template); static unsigned int nft_ct_pcpu_template_refcnt __read_mostly; static DEFINE_MUTEX(nft_ct_pcpu_mutex); #endif static u64 nft_ct_get_eval_counter(const struct nf_conn_counter *c, enum nft_ct_keys k, enum ip_conntrack_dir d) { if (d < IP_CT_DIR_MAX) return k == NFT_CT_BYTES ? atomic64_read(&c[d].bytes) : atomic64_read(&c[d].packets); return nft_ct_get_eval_counter(c, k, IP_CT_DIR_ORIGINAL) + nft_ct_get_eval_counter(c, k, IP_CT_DIR_REPLY); } static void nft_ct_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_ct *priv = nft_expr_priv(expr); u32 *dest = &regs->data[priv->dreg]; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; const struct nf_conn_help *help; const struct nf_conntrack_tuple *tuple; const struct nf_conntrack_helper *helper; unsigned int state; ct = nf_ct_get(pkt->skb, &ctinfo); switch (priv->key) { case NFT_CT_STATE: if (ct) state = NF_CT_STATE_BIT(ctinfo); else if (ctinfo == IP_CT_UNTRACKED) state = NF_CT_STATE_UNTRACKED_BIT; else state = NF_CT_STATE_INVALID_BIT; *dest = state; return; default: break; } if (ct == NULL) goto err; switch (priv->key) { case NFT_CT_DIRECTION: nft_reg_store8(dest, CTINFO2DIR(ctinfo)); return; case NFT_CT_STATUS: *dest = ct->status; return; #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: *dest = READ_ONCE(ct->mark); return; #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK case NFT_CT_SECMARK: *dest = ct->secmark; return; #endif case NFT_CT_EXPIRATION: *dest = jiffies_to_msecs(nf_ct_expires(ct)); return; case NFT_CT_HELPER: if (ct->master == NULL) goto err; help = nfct_help(ct->master); if (help == NULL) goto err; helper = rcu_dereference(help->helper); if (helper == NULL) goto err; strscpy_pad((char *)dest, helper->name, NF_CT_HELPER_NAME_LEN); return; #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: { struct nf_conn_labels *labels = nf_ct_labels_find(ct); if (labels) memcpy(dest, labels->bits, NF_CT_LABELS_MAX_SIZE); else memset(dest, 0, NF_CT_LABELS_MAX_SIZE); return; } #endif case NFT_CT_BYTES: case NFT_CT_PKTS: { const struct nf_conn_acct *acct = nf_conn_acct_find(ct); u64 count = 0; if (acct) count = nft_ct_get_eval_counter(acct->counter, priv->key, priv->dir); memcpy(dest, &count, sizeof(count)); return; } case NFT_CT_AVGPKT: { const struct nf_conn_acct *acct = nf_conn_acct_find(ct); u64 avgcnt = 0, bcnt = 0, pcnt = 0; if (acct) { pcnt = nft_ct_get_eval_counter(acct->counter, NFT_CT_PKTS, priv->dir); bcnt = nft_ct_get_eval_counter(acct->counter, NFT_CT_BYTES, priv->dir); if (pcnt != 0) avgcnt = div64_u64(bcnt, pcnt); } memcpy(dest, &avgcnt, sizeof(avgcnt)); return; } case NFT_CT_L3PROTOCOL: nft_reg_store8(dest, nf_ct_l3num(ct)); return; case NFT_CT_PROTOCOL: nft_reg_store8(dest, nf_ct_protonum(ct)); return; #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: { const struct nf_conntrack_zone *zone = nf_ct_zone(ct); u16 zoneid; if (priv->dir < IP_CT_DIR_MAX) zoneid = nf_ct_zone_id(zone, priv->dir); else zoneid = zone->id; nft_reg_store16(dest, zoneid); return; } #endif case NFT_CT_ID: *dest = nf_ct_get_id(ct); return; default: break; } tuple = &ct->tuplehash[priv->dir].tuple; switch (priv->key) { case NFT_CT_SRC: memcpy(dest, tuple->src.u3.all, nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16); return; case NFT_CT_DST: memcpy(dest, tuple->dst.u3.all, nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16); return; case NFT_CT_PROTO_SRC: nft_reg_store16(dest, (__force u16)tuple->src.u.all); return; case NFT_CT_PROTO_DST: nft_reg_store16(dest, (__force u16)tuple->dst.u.all); return; case NFT_CT_SRC_IP: if (nf_ct_l3num(ct) != NFPROTO_IPV4) goto err; *dest = (__force __u32)tuple->src.u3.ip; return; case NFT_CT_DST_IP: if (nf_ct_l3num(ct) != NFPROTO_IPV4) goto err; *dest = (__force __u32)tuple->dst.u3.ip; return; case NFT_CT_SRC_IP6: if (nf_ct_l3num(ct) != NFPROTO_IPV6) goto err; memcpy(dest, tuple->src.u3.ip6, sizeof(struct in6_addr)); return; case NFT_CT_DST_IP6: if (nf_ct_l3num(ct) != NFPROTO_IPV6) goto err; memcpy(dest, tuple->dst.u3.ip6, sizeof(struct in6_addr)); return; default: break; } return; err: regs->verdict.code = NFT_BREAK; } #ifdef CONFIG_NF_CONNTRACK_ZONES static void nft_ct_set_zone_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nf_conntrack_zone zone = { .dir = NF_CT_DEFAULT_ZONE_DIR }; const struct nft_ct *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; enum ip_conntrack_info ctinfo; u16 value = nft_reg_load16(&regs->data[priv->sreg]); struct nf_conn *ct; int oldcnt; ct = nf_ct_get(skb, &ctinfo); if (ct) /* already tracked */ return; zone.id = value; switch (priv->dir) { case IP_CT_DIR_ORIGINAL: zone.dir = NF_CT_ZONE_DIR_ORIG; break; case IP_CT_DIR_REPLY: zone.dir = NF_CT_ZONE_DIR_REPL; break; default: break; } ct = this_cpu_read(nft_ct_pcpu_template); __refcount_inc(&ct->ct_general.use, &oldcnt); if (likely(oldcnt == 1)) { nf_ct_zone_add(ct, &zone); } else { refcount_dec(&ct->ct_general.use); /* previous skb got queued to userspace, allocate temporary * one until percpu template can be reused. */ ct = nf_ct_tmpl_alloc(nft_net(pkt), &zone, GFP_ATOMIC); if (!ct) { regs->verdict.code = NF_DROP; return; } __set_bit(IPS_CONFIRMED_BIT, &ct->status); } nf_ct_set(skb, ct, IP_CT_NEW); } #endif static void nft_ct_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_ct *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; #if defined(CONFIG_NF_CONNTRACK_MARK) || defined(CONFIG_NF_CONNTRACK_SECMARK) u32 value = regs->data[priv->sreg]; #endif enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); if (ct == NULL || nf_ct_is_template(ct)) return; switch (priv->key) { #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: if (READ_ONCE(ct->mark) != value) { WRITE_ONCE(ct->mark, value); nf_conntrack_event_cache(IPCT_MARK, ct); } break; #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK case NFT_CT_SECMARK: if (ct->secmark != value) { ct->secmark = value; nf_conntrack_event_cache(IPCT_SECMARK, ct); } break; #endif #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: nf_connlabels_replace(ct, &regs->data[priv->sreg], &regs->data[priv->sreg], NF_CT_LABELS_MAX_SIZE / sizeof(u32)); break; #endif #ifdef CONFIG_NF_CONNTRACK_EVENTS case NFT_CT_EVENTMASK: { struct nf_conntrack_ecache *e = nf_ct_ecache_find(ct); u32 ctmask = regs->data[priv->sreg]; if (e) { if (e->ctmask != ctmask) e->ctmask = ctmask; break; } if (ctmask && !nf_ct_is_confirmed(ct)) nf_ct_ecache_ext_add(ct, ctmask, 0, GFP_ATOMIC); break; } #endif default: break; } } static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = { [NFTA_CT_DREG] = { .type = NLA_U32 }, [NFTA_CT_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_CT_DIRECTION] = { .type = NLA_U8 }, [NFTA_CT_SREG] = { .type = NLA_U32 }, }; #ifdef CONFIG_NF_CONNTRACK_ZONES static void nft_ct_tmpl_put_pcpu(void) { struct nf_conn *ct; int cpu; for_each_possible_cpu(cpu) { ct = per_cpu(nft_ct_pcpu_template, cpu); if (!ct) break; nf_ct_put(ct); per_cpu(nft_ct_pcpu_template, cpu) = NULL; } } static bool nft_ct_tmpl_alloc_pcpu(void) { struct nf_conntrack_zone zone = { .id = 0 }; struct nf_conn *tmp; int cpu; if (nft_ct_pcpu_template_refcnt) return true; for_each_possible_cpu(cpu) { tmp = nf_ct_tmpl_alloc(&init_net, &zone, GFP_KERNEL); if (!tmp) { nft_ct_tmpl_put_pcpu(); return false; } __set_bit(IPS_CONFIRMED_BIT, &tmp->status); per_cpu(nft_ct_pcpu_template, cpu) = tmp; } return true; } #endif static int nft_ct_get_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_ct *priv = nft_expr_priv(expr); unsigned int len; int err; priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); priv->dir = IP_CT_DIR_MAX; switch (priv->key) { case NFT_CT_DIRECTION: if (tb[NFTA_CT_DIRECTION] != NULL) return -EINVAL; len = sizeof(u8); break; case NFT_CT_STATE: case NFT_CT_STATUS: #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK case NFT_CT_SECMARK: #endif case NFT_CT_EXPIRATION: if (tb[NFTA_CT_DIRECTION] != NULL) return -EINVAL; len = sizeof(u32); break; #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: if (tb[NFTA_CT_DIRECTION] != NULL) return -EINVAL; len = NF_CT_LABELS_MAX_SIZE; break; #endif case NFT_CT_HELPER: if (tb[NFTA_CT_DIRECTION] != NULL) return -EINVAL; len = NF_CT_HELPER_NAME_LEN; break; case NFT_CT_L3PROTOCOL: case NFT_CT_PROTOCOL: /* For compatibility, do not report error if NFTA_CT_DIRECTION * attribute is specified. */ len = sizeof(u8); break; case NFT_CT_SRC: case NFT_CT_DST: if (tb[NFTA_CT_DIRECTION] == NULL) return -EINVAL; switch (ctx->family) { case NFPROTO_IPV4: len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip); break; case NFPROTO_IPV6: case NFPROTO_INET: len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip6); break; default: return -EAFNOSUPPORT; } break; case NFT_CT_SRC_IP: case NFT_CT_DST_IP: if (tb[NFTA_CT_DIRECTION] == NULL) return -EINVAL; len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip); break; case NFT_CT_SRC_IP6: case NFT_CT_DST_IP6: if (tb[NFTA_CT_DIRECTION] == NULL) return -EINVAL; len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip6); break; case NFT_CT_PROTO_SRC: case NFT_CT_PROTO_DST: if (tb[NFTA_CT_DIRECTION] == NULL) return -EINVAL; len = sizeof_field(struct nf_conntrack_tuple, src.u.all); break; case NFT_CT_BYTES: case NFT_CT_PKTS: case NFT_CT_AVGPKT: len = sizeof(u64); break; #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: len = sizeof(u16); break; #endif case NFT_CT_ID: if (tb[NFTA_CT_DIRECTION]) return -EINVAL; len = sizeof(u32); break; default: return -EOPNOTSUPP; } if (tb[NFTA_CT_DIRECTION] != NULL) { priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]); switch (priv->dir) { case IP_CT_DIR_ORIGINAL: case IP_CT_DIR_REPLY: break; default: return -EINVAL; } } priv->len = len; err = nft_parse_register_store(ctx, tb[NFTA_CT_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, len); if (err < 0) return err; err = nf_ct_netns_get(ctx->net, ctx->family); if (err < 0) return err; if (priv->key == NFT_CT_BYTES || priv->key == NFT_CT_PKTS || priv->key == NFT_CT_AVGPKT) nf_ct_set_acct(ctx->net, true); return 0; } static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv) { switch (priv->key) { #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: nf_connlabels_put(ctx->net); break; #endif #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: mutex_lock(&nft_ct_pcpu_mutex); if (--nft_ct_pcpu_template_refcnt == 0) nft_ct_tmpl_put_pcpu(); mutex_unlock(&nft_ct_pcpu_mutex); break; #endif default: break; } } static int nft_ct_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_ct *priv = nft_expr_priv(expr); unsigned int len; int err; priv->dir = IP_CT_DIR_MAX; priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); switch (priv->key) { #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: if (tb[NFTA_CT_DIRECTION]) return -EINVAL; len = sizeof_field(struct nf_conn, mark); break; #endif #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: if (tb[NFTA_CT_DIRECTION]) return -EINVAL; len = NF_CT_LABELS_MAX_SIZE; err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1); if (err) return err; break; #endif #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: mutex_lock(&nft_ct_pcpu_mutex); if (!nft_ct_tmpl_alloc_pcpu()) { mutex_unlock(&nft_ct_pcpu_mutex); return -ENOMEM; } nft_ct_pcpu_template_refcnt++; mutex_unlock(&nft_ct_pcpu_mutex); len = sizeof(u16); break; #endif #ifdef CONFIG_NF_CONNTRACK_EVENTS case NFT_CT_EVENTMASK: if (tb[NFTA_CT_DIRECTION]) return -EINVAL; len = sizeof(u32); break; #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK case NFT_CT_SECMARK: if (tb[NFTA_CT_DIRECTION]) return -EINVAL; len = sizeof(u32); break; #endif default: return -EOPNOTSUPP; } if (tb[NFTA_CT_DIRECTION]) { priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]); switch (priv->dir) { case IP_CT_DIR_ORIGINAL: case IP_CT_DIR_REPLY: break; default: err = -EINVAL; goto err1; } } priv->len = len; err = nft_parse_register_load(ctx, tb[NFTA_CT_SREG], &priv->sreg, len); if (err < 0) goto err1; err = nf_ct_netns_get(ctx->net, ctx->family); if (err < 0) goto err1; return 0; err1: __nft_ct_set_destroy(ctx, priv); return err; } static void nft_ct_get_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { nf_ct_netns_put(ctx->net, ctx->family); } static void nft_ct_set_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_ct *priv = nft_expr_priv(expr); __nft_ct_set_destroy(ctx, priv); nf_ct_netns_put(ctx->net, ctx->family); } static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_ct *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_CT_DREG, priv->dreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key))) goto nla_put_failure; switch (priv->key) { case NFT_CT_SRC: case NFT_CT_DST: case NFT_CT_SRC_IP: case NFT_CT_DST_IP: case NFT_CT_SRC_IP6: case NFT_CT_DST_IP6: case NFT_CT_PROTO_SRC: case NFT_CT_PROTO_DST: if (nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) goto nla_put_failure; break; case NFT_CT_BYTES: case NFT_CT_PKTS: case NFT_CT_AVGPKT: case NFT_CT_ZONE: if (priv->dir < IP_CT_DIR_MAX && nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) goto nla_put_failure; break; default: break; } return 0; nla_put_failure: return -1; } static bool nft_ct_get_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { const struct nft_ct *priv = nft_expr_priv(expr); const struct nft_ct *ct; if (!nft_reg_track_cmp(track, expr, priv->dreg)) { nft_reg_track_update(track, expr, priv->dreg, priv->len); return false; } ct = nft_expr_priv(track->regs[priv->dreg].selector); if (priv->key != ct->key) { nft_reg_track_update(track, expr, priv->dreg, priv->len); return false; } if (!track->regs[priv->dreg].bitwise) return true; return nft_expr_reduce_bitwise(track, expr); } static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_ct *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_CT_SREG, priv->sreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key))) goto nla_put_failure; switch (priv->key) { case NFT_CT_ZONE: if (priv->dir < IP_CT_DIR_MAX && nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) goto nla_put_failure; break; default: break; } return 0; nla_put_failure: return -1; } static struct nft_expr_type nft_ct_type; static const struct nft_expr_ops nft_ct_get_ops = { .type = &nft_ct_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), .eval = nft_ct_get_eval, .init = nft_ct_get_init, .destroy = nft_ct_get_destroy, .dump = nft_ct_get_dump, .reduce = nft_ct_get_reduce, }; static bool nft_ct_set_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { int i; for (i = 0; i < NFT_REG32_NUM; i++) { if (!track->regs[i].selector) continue; if (track->regs[i].selector->ops != &nft_ct_get_ops) continue; __nft_reg_track_cancel(track, i); } return false; } #ifdef CONFIG_MITIGATION_RETPOLINE static const struct nft_expr_ops nft_ct_get_fast_ops = { .type = &nft_ct_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), .eval = nft_ct_get_fast_eval, .init = nft_ct_get_init, .destroy = nft_ct_get_destroy, .dump = nft_ct_get_dump, .reduce = nft_ct_set_reduce, }; #endif static const struct nft_expr_ops nft_ct_set_ops = { .type = &nft_ct_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), .eval = nft_ct_set_eval, .init = nft_ct_set_init, .destroy = nft_ct_set_destroy, .dump = nft_ct_set_dump, .reduce = nft_ct_set_reduce, }; #ifdef CONFIG_NF_CONNTRACK_ZONES static const struct nft_expr_ops nft_ct_set_zone_ops = { .type = &nft_ct_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), .eval = nft_ct_set_zone_eval, .init = nft_ct_set_init, .destroy = nft_ct_set_destroy, .dump = nft_ct_set_dump, .reduce = nft_ct_set_reduce, }; #endif static const struct nft_expr_ops * nft_ct_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { if (tb[NFTA_CT_KEY] == NULL) return ERR_PTR(-EINVAL); if (tb[NFTA_CT_DREG] && tb[NFTA_CT_SREG]) return ERR_PTR(-EINVAL); if (tb[NFTA_CT_DREG]) { #ifdef CONFIG_MITIGATION_RETPOLINE u32 k = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); switch (k) { case NFT_CT_STATE: case NFT_CT_DIRECTION: case NFT_CT_STATUS: case NFT_CT_MARK: case NFT_CT_SECMARK: return &nft_ct_get_fast_ops; } #endif return &nft_ct_get_ops; } if (tb[NFTA_CT_SREG]) { #ifdef CONFIG_NF_CONNTRACK_ZONES if (nla_get_be32(tb[NFTA_CT_KEY]) == htonl(NFT_CT_ZONE)) return &nft_ct_set_zone_ops; #endif return &nft_ct_set_ops; } return ERR_PTR(-EINVAL); } static struct nft_expr_type nft_ct_type __read_mostly = { .name = "ct", .select_ops = nft_ct_select_ops, .policy = nft_ct_policy, .maxattr = NFTA_CT_MAX, .owner = THIS_MODULE, }; static void nft_notrack_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct sk_buff *skb = pkt->skb; enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(pkt->skb, &ctinfo); /* Previously seen (loopback or untracked)? Ignore. */ if (ct || ctinfo == IP_CT_UNTRACKED) return; nf_ct_set(skb, ct, IP_CT_UNTRACKED); } static struct nft_expr_type nft_notrack_type; static const struct nft_expr_ops nft_notrack_ops = { .type = &nft_notrack_type, .size = NFT_EXPR_SIZE(0), .eval = nft_notrack_eval, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_notrack_type __read_mostly = { .name = "notrack", .ops = &nft_notrack_ops, .owner = THIS_MODULE, }; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT static int nft_ct_timeout_parse_policy(void *timeouts, const struct nf_conntrack_l4proto *l4proto, struct net *net, const struct nlattr *attr) { struct nlattr **tb; int ret = 0; tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb), GFP_KERNEL); if (!tb) return -ENOMEM; ret = nla_parse_nested_deprecated(tb, l4proto->ctnl_timeout.nlattr_max, attr, l4proto->ctnl_timeout.nla_policy, NULL); if (ret < 0) goto err; ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts); err: kfree(tb); return ret; } struct nft_ct_timeout_obj { struct nf_ct_timeout *timeout; u8 l4proto; }; static void nft_ct_timeout_obj_eval(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_ct_timeout_obj *priv = nft_obj_data(obj); struct nf_conn *ct = (struct nf_conn *)skb_nfct(pkt->skb); struct nf_conn_timeout *timeout; const unsigned int *values; if (priv->l4proto != pkt->tprot) return; if (!ct || nf_ct_is_template(ct) || nf_ct_is_confirmed(ct)) return; timeout = nf_ct_timeout_find(ct); if (!timeout) { timeout = nf_ct_timeout_ext_add(ct, priv->timeout, GFP_ATOMIC); if (!timeout) { regs->verdict.code = NF_DROP; return; } } rcu_assign_pointer(timeout->timeout, priv->timeout); /* adjust the timeout as per 'new' state. ct is unconfirmed, * so the current timestamp must not be added. */ values = nf_ct_timeout_data(timeout); if (values) nf_ct_refresh(ct, values[0]); } static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { struct nft_ct_timeout_obj *priv = nft_obj_data(obj); const struct nf_conntrack_l4proto *l4proto; struct nf_ct_timeout *timeout; int l3num = ctx->family; __u8 l4num; int ret; if (!tb[NFTA_CT_TIMEOUT_L4PROTO] || !tb[NFTA_CT_TIMEOUT_DATA]) return -EINVAL; if (tb[NFTA_CT_TIMEOUT_L3PROTO]) l3num = ntohs(nla_get_be16(tb[NFTA_CT_TIMEOUT_L3PROTO])); l4num = nla_get_u8(tb[NFTA_CT_TIMEOUT_L4PROTO]); priv->l4proto = l4num; l4proto = nf_ct_l4proto_find(l4num); if (l4proto->l4proto != l4num) { ret = -EOPNOTSUPP; goto err_proto_put; } timeout = kzalloc(sizeof(struct nf_ct_timeout) + l4proto->ctnl_timeout.obj_size, GFP_KERNEL); if (timeout == NULL) { ret = -ENOMEM; goto err_proto_put; } ret = nft_ct_timeout_parse_policy(&timeout->data, l4proto, ctx->net, tb[NFTA_CT_TIMEOUT_DATA]); if (ret < 0) goto err_free_timeout; timeout->l3num = l3num; timeout->l4proto = l4proto; ret = nf_ct_netns_get(ctx->net, ctx->family); if (ret < 0) goto err_free_timeout; priv->timeout = timeout; return 0; err_free_timeout: kfree(timeout); err_proto_put: return ret; } static void nft_ct_timeout_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { struct nft_ct_timeout_obj *priv = nft_obj_data(obj); struct nf_ct_timeout *timeout = priv->timeout; nf_ct_untimeout(ctx->net, timeout); nf_ct_netns_put(ctx->net, ctx->family); kfree(priv->timeout); } static int nft_ct_timeout_obj_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { const struct nft_ct_timeout_obj *priv = nft_obj_data(obj); const struct nf_ct_timeout *timeout = priv->timeout; struct nlattr *nest_params; int ret; if (nla_put_u8(skb, NFTA_CT_TIMEOUT_L4PROTO, timeout->l4proto->l4proto) || nla_put_be16(skb, NFTA_CT_TIMEOUT_L3PROTO, htons(timeout->l3num))) return -1; nest_params = nla_nest_start(skb, NFTA_CT_TIMEOUT_DATA); if (!nest_params) return -1; ret = timeout->l4proto->ctnl_timeout.obj_to_nlattr(skb, &timeout->data); if (ret < 0) return -1; nla_nest_end(skb, nest_params); return 0; } static const struct nla_policy nft_ct_timeout_policy[NFTA_CT_TIMEOUT_MAX + 1] = { [NFTA_CT_TIMEOUT_L3PROTO] = {.type = NLA_U16 }, [NFTA_CT_TIMEOUT_L4PROTO] = {.type = NLA_U8 }, [NFTA_CT_TIMEOUT_DATA] = {.type = NLA_NESTED }, }; static struct nft_object_type nft_ct_timeout_obj_type; static const struct nft_object_ops nft_ct_timeout_obj_ops = { .type = &nft_ct_timeout_obj_type, .size = sizeof(struct nft_ct_timeout_obj), .eval = nft_ct_timeout_obj_eval, .init = nft_ct_timeout_obj_init, .destroy = nft_ct_timeout_obj_destroy, .dump = nft_ct_timeout_obj_dump, }; static struct nft_object_type nft_ct_timeout_obj_type __read_mostly = { .type = NFT_OBJECT_CT_TIMEOUT, .ops = &nft_ct_timeout_obj_ops, .maxattr = NFTA_CT_TIMEOUT_MAX, .policy = nft_ct_timeout_policy, .owner = THIS_MODULE, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { struct nft_ct_helper_obj *priv = nft_obj_data(obj); struct nf_conntrack_helper *help4, *help6; char name[NF_CT_HELPER_NAME_LEN]; int family = ctx->family; int err; if (!tb[NFTA_CT_HELPER_NAME] || !tb[NFTA_CT_HELPER_L4PROTO]) return -EINVAL; priv->l4proto = nla_get_u8(tb[NFTA_CT_HELPER_L4PROTO]); if (!priv->l4proto) return -ENOENT; nla_strscpy(name, tb[NFTA_CT_HELPER_NAME], sizeof(name)); if (tb[NFTA_CT_HELPER_L3PROTO]) family = ntohs(nla_get_be16(tb[NFTA_CT_HELPER_L3PROTO])); help4 = NULL; help6 = NULL; switch (family) { case NFPROTO_IPV4: if (ctx->family == NFPROTO_IPV6) return -EINVAL; help4 = nf_conntrack_helper_try_module_get(name, family, priv->l4proto); break; case NFPROTO_IPV6: if (ctx->family == NFPROTO_IPV4) return -EINVAL; help6 = nf_conntrack_helper_try_module_get(name, family, priv->l4proto); break; case NFPROTO_NETDEV: case NFPROTO_BRIDGE: case NFPROTO_INET: help4 = nf_conntrack_helper_try_module_get(name, NFPROTO_IPV4, priv->l4proto); help6 = nf_conntrack_helper_try_module_get(name, NFPROTO_IPV6, priv->l4proto); break; default: return -EAFNOSUPPORT; } /* && is intentional; only error if INET found neither ipv4 or ipv6 */ if (!help4 && !help6) return -ENOENT; priv->helper4 = help4; priv->helper6 = help6; err = nf_ct_netns_get(ctx->net, ctx->family); if (err < 0) goto err_put_helper; return 0; err_put_helper: if (priv->helper4) nf_conntrack_helper_put(priv->helper4); if (priv->helper6) nf_conntrack_helper_put(priv->helper6); return err; } static void nft_ct_helper_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { struct nft_ct_helper_obj *priv = nft_obj_data(obj); if (priv->helper4) nf_conntrack_helper_put(priv->helper4); if (priv->helper6) nf_conntrack_helper_put(priv->helper6); nf_ct_netns_put(ctx->net, ctx->family); } static void nft_ct_helper_obj_eval(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_ct_helper_obj *priv = nft_obj_data(obj); struct nf_conn *ct = (struct nf_conn *)skb_nfct(pkt->skb); struct nf_conntrack_helper *to_assign = NULL; struct nf_conn_help *help; if (!ct || nf_ct_is_confirmed(ct) || nf_ct_is_template(ct) || priv->l4proto != nf_ct_protonum(ct)) return; switch (nf_ct_l3num(ct)) { case NFPROTO_IPV4: to_assign = priv->helper4; break; case NFPROTO_IPV6: to_assign = priv->helper6; break; default: WARN_ON_ONCE(1); return; } if (!to_assign) return; if (test_bit(IPS_HELPER_BIT, &ct->status)) return; help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); if (help) { rcu_assign_pointer(help->helper, to_assign); set_bit(IPS_HELPER_BIT, &ct->status); } } static int nft_ct_helper_obj_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { const struct nft_ct_helper_obj *priv = nft_obj_data(obj); const struct nf_conntrack_helper *helper; u16 family; if (priv->helper4 && priv->helper6) { family = NFPROTO_INET; helper = priv->helper4; } else if (priv->helper6) { family = NFPROTO_IPV6; helper = priv->helper6; } else { family = NFPROTO_IPV4; helper = priv->helper4; } if (nla_put_string(skb, NFTA_CT_HELPER_NAME, helper->name)) return -1; if (nla_put_u8(skb, NFTA_CT_HELPER_L4PROTO, priv->l4proto)) return -1; if (nla_put_be16(skb, NFTA_CT_HELPER_L3PROTO, htons(family))) return -1; return 0; } static const struct nla_policy nft_ct_helper_policy[NFTA_CT_HELPER_MAX + 1] = { [NFTA_CT_HELPER_NAME] = { .type = NLA_STRING, .len = NF_CT_HELPER_NAME_LEN - 1 }, [NFTA_CT_HELPER_L3PROTO] = { .type = NLA_U16 }, [NFTA_CT_HELPER_L4PROTO] = { .type = NLA_U8 }, }; static struct nft_object_type nft_ct_helper_obj_type; static const struct nft_object_ops nft_ct_helper_obj_ops = { .type = &nft_ct_helper_obj_type, .size = sizeof(struct nft_ct_helper_obj), .eval = nft_ct_helper_obj_eval, .init = nft_ct_helper_obj_init, .destroy = nft_ct_helper_obj_destroy, .dump = nft_ct_helper_obj_dump, }; static struct nft_object_type nft_ct_helper_obj_type __read_mostly = { .type = NFT_OBJECT_CT_HELPER, .ops = &nft_ct_helper_obj_ops, .maxattr = NFTA_CT_HELPER_MAX, .policy = nft_ct_helper_policy, .owner = THIS_MODULE, }; struct nft_ct_expect_obj { u16 l3num; __be16 dport; u8 l4proto; u8 size; u32 timeout; }; static int nft_ct_expect_obj_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { struct nft_ct_expect_obj *priv = nft_obj_data(obj); if (!tb[NFTA_CT_EXPECT_L4PROTO] || !tb[NFTA_CT_EXPECT_DPORT] || !tb[NFTA_CT_EXPECT_TIMEOUT] || !tb[NFTA_CT_EXPECT_SIZE]) return -EINVAL; priv->l3num = ctx->family; if (tb[NFTA_CT_EXPECT_L3PROTO]) priv->l3num = ntohs(nla_get_be16(tb[NFTA_CT_EXPECT_L3PROTO])); switch (priv->l3num) { case NFPROTO_IPV4: case NFPROTO_IPV6: if (priv->l3num == ctx->family || ctx->family == NFPROTO_INET) break; return -EINVAL; case NFPROTO_INET: /* tuple.src.l3num supports NFPROTO_IPV4/6 only */ default: return -EAFNOSUPPORT; } priv->l4proto = nla_get_u8(tb[NFTA_CT_EXPECT_L4PROTO]); switch (priv->l4proto) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_DCCP: case IPPROTO_SCTP: break; default: return -EOPNOTSUPP; } priv->dport = nla_get_be16(tb[NFTA_CT_EXPECT_DPORT]); priv->timeout = nla_get_u32(tb[NFTA_CT_EXPECT_TIMEOUT]); priv->size = nla_get_u8(tb[NFTA_CT_EXPECT_SIZE]); return nf_ct_netns_get(ctx->net, ctx->family); } static void nft_ct_expect_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { nf_ct_netns_put(ctx->net, ctx->family); } static int nft_ct_expect_obj_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { const struct nft_ct_expect_obj *priv = nft_obj_data(obj); if (nla_put_be16(skb, NFTA_CT_EXPECT_L3PROTO, htons(priv->l3num)) || nla_put_u8(skb, NFTA_CT_EXPECT_L4PROTO, priv->l4proto) || nla_put_be16(skb, NFTA_CT_EXPECT_DPORT, priv->dport) || nla_put_u32(skb, NFTA_CT_EXPECT_TIMEOUT, priv->timeout) || nla_put_u8(skb, NFTA_CT_EXPECT_SIZE, priv->size)) return -1; return 0; } static void nft_ct_expect_obj_eval(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_ct_expect_obj *priv = nft_obj_data(obj); struct nf_conntrack_expect *exp; enum ip_conntrack_info ctinfo; struct nf_conn_help *help; enum ip_conntrack_dir dir; u16 l3num = priv->l3num; struct nf_conn *ct; ct = nf_ct_get(pkt->skb, &ctinfo); if (!ct || nf_ct_is_confirmed(ct) || nf_ct_is_template(ct)) { regs->verdict.code = NFT_BREAK; return; } dir = CTINFO2DIR(ctinfo); help = nfct_help(ct); if (!help) help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); if (!help) { regs->verdict.code = NF_DROP; return; } if (help->expecting[NF_CT_EXPECT_CLASS_DEFAULT] >= priv->size) { regs->verdict.code = NFT_BREAK; return; } if (l3num == NFPROTO_INET) l3num = nf_ct_l3num(ct); exp = nf_ct_expect_alloc(ct); if (exp == NULL) { regs->verdict.code = NF_DROP; return; } nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, l3num, &ct->tuplehash[!dir].tuple.src.u3, &ct->tuplehash[!dir].tuple.dst.u3, priv->l4proto, NULL, &priv->dport); exp->timeout.expires = jiffies + priv->timeout * HZ; if (nf_ct_expect_related(exp, 0) != 0) regs->verdict.code = NF_DROP; } static const struct nla_policy nft_ct_expect_policy[NFTA_CT_EXPECT_MAX + 1] = { [NFTA_CT_EXPECT_L3PROTO] = { .type = NLA_U16 }, [NFTA_CT_EXPECT_L4PROTO] = { .type = NLA_U8 }, [NFTA_CT_EXPECT_DPORT] = { .type = NLA_U16 }, [NFTA_CT_EXPECT_TIMEOUT] = { .type = NLA_U32 }, [NFTA_CT_EXPECT_SIZE] = { .type = NLA_U8 }, }; static struct nft_object_type nft_ct_expect_obj_type; static const struct nft_object_ops nft_ct_expect_obj_ops = { .type = &nft_ct_expect_obj_type, .size = sizeof(struct nft_ct_expect_obj), .eval = nft_ct_expect_obj_eval, .init = nft_ct_expect_obj_init, .destroy = nft_ct_expect_obj_destroy, .dump = nft_ct_expect_obj_dump, }; static struct nft_object_type nft_ct_expect_obj_type __read_mostly = { .type = NFT_OBJECT_CT_EXPECT, .ops = &nft_ct_expect_obj_ops, .maxattr = NFTA_CT_EXPECT_MAX, .policy = nft_ct_expect_policy, .owner = THIS_MODULE, }; static int __init nft_ct_module_init(void) { int err; BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE > NFT_REG_SIZE); err = nft_register_expr(&nft_ct_type); if (err < 0) return err; err = nft_register_expr(&nft_notrack_type); if (err < 0) goto err1; err = nft_register_obj(&nft_ct_helper_obj_type); if (err < 0) goto err2; err = nft_register_obj(&nft_ct_expect_obj_type); if (err < 0) goto err3; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT err = nft_register_obj(&nft_ct_timeout_obj_type); if (err < 0) goto err4; #endif return 0; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT err4: nft_unregister_obj(&nft_ct_expect_obj_type); #endif err3: nft_unregister_obj(&nft_ct_helper_obj_type); err2: nft_unregister_expr(&nft_notrack_type); err1: nft_unregister_expr(&nft_ct_type); return err; } static void __exit nft_ct_module_exit(void) { #ifdef CONFIG_NF_CONNTRACK_TIMEOUT nft_unregister_obj(&nft_ct_timeout_obj_type); #endif nft_unregister_obj(&nft_ct_expect_obj_type); nft_unregister_obj(&nft_ct_helper_obj_type); nft_unregister_expr(&nft_notrack_type); nft_unregister_expr(&nft_ct_type); } module_init(nft_ct_module_init); module_exit(nft_ct_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_ALIAS_NFT_EXPR("ct"); MODULE_ALIAS_NFT_EXPR("notrack"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_HELPER); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_TIMEOUT); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_EXPECT); MODULE_DESCRIPTION("Netfilter nf_tables conntrack module");
60 6 4 50 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 // SPDX-License-Identifier: GPL-2.0-or-later /* * IP Payload Compression Protocol (IPComp) - RFC3173. * * Copyright (c) 2003 James Morris <jmorris@intercode.com.au> * Copyright (c) 2003-2025 Herbert Xu <herbert@gondor.apana.org.au> * * Todo: * - Tunable compression parameters. * - Compression stats. * - Adaptive compression. */ #include <crypto/acompress.h> #include <linux/err.h> #include <linux/module.h> #include <linux/skbuff_ref.h> #include <linux/slab.h> #include <net/ipcomp.h> #include <net/xfrm.h> #define IPCOMP_SCRATCH_SIZE 65400 struct ipcomp_skb_cb { struct xfrm_skb_cb xfrm; struct acomp_req *req; }; struct ipcomp_data { u16 threshold; struct crypto_acomp *tfm; }; struct ipcomp_req_extra { struct xfrm_state *x; struct scatterlist sg[]; }; static inline struct ipcomp_skb_cb *ipcomp_cb(struct sk_buff *skb) { struct ipcomp_skb_cb *cb = (void *)skb->cb; BUILD_BUG_ON(sizeof(*cb) > sizeof(skb->cb)); return cb; } static int ipcomp_post_acomp(struct sk_buff *skb, int err, int hlen) { struct acomp_req *req = ipcomp_cb(skb)->req; struct ipcomp_req_extra *extra; struct scatterlist *dsg; int len, dlen; if (unlikely(err)) goto out_free_req; extra = acomp_request_extra(req); dsg = extra->sg; dlen = req->dlen; pskb_trim_unique(skb, 0); __skb_put(skb, hlen); /* Only update truesize on input. */ if (!hlen) skb->truesize += dlen; skb->data_len = dlen; skb->len += dlen; do { skb_frag_t *frag; struct page *page; frag = skb_shinfo(skb)->frags + skb_shinfo(skb)->nr_frags; page = sg_page(dsg); dsg = sg_next(dsg); len = PAGE_SIZE; if (dlen < len) len = dlen; skb_frag_fill_page_desc(frag, page, 0, len); skb_shinfo(skb)->nr_frags++; } while ((dlen -= len)); for (; dsg; dsg = sg_next(dsg)) __free_page(sg_page(dsg)); out_free_req: acomp_request_free(req); return err; } static int ipcomp_input_done2(struct sk_buff *skb, int err) { struct ip_comp_hdr *ipch = ip_comp_hdr(skb); const int plen = skb->len; skb->transport_header = skb->network_header + sizeof(*ipch); return ipcomp_post_acomp(skb, err, 0) ?: skb->len < (plen + sizeof(ip_comp_hdr)) ? -EINVAL : ipch->nexthdr; } static void ipcomp_input_done(void *data, int err) { struct sk_buff *skb = data; xfrm_input_resume(skb, ipcomp_input_done2(skb, err)); } static struct acomp_req *ipcomp_setup_req(struct xfrm_state *x, struct sk_buff *skb, int minhead, int dlen) { const int dnfrags = min(MAX_SKB_FRAGS, 16); struct ipcomp_data *ipcd = x->data; struct ipcomp_req_extra *extra; struct scatterlist *sg, *dsg; const int plen = skb->len; struct crypto_acomp *tfm; struct acomp_req *req; int nfrags; int total; int err; int i; ipcomp_cb(skb)->req = NULL; do { struct sk_buff *trailer; if (skb->len > PAGE_SIZE) { if (skb_linearize_cow(skb)) return ERR_PTR(-ENOMEM); nfrags = 1; break; } if (!skb_cloned(skb) && skb_headlen(skb) >= minhead) { if (!skb_is_nonlinear(skb)) { nfrags = 1; break; } else if (!skb_has_frag_list(skb)) { nfrags = skb_shinfo(skb)->nr_frags; nfrags++; break; } } nfrags = skb_cow_data(skb, skb_headlen(skb) < minhead ? minhead - skb_headlen(skb) : 0, &trailer); if (nfrags < 0) return ERR_PTR(nfrags); } while (0); tfm = ipcd->tfm; req = acomp_request_alloc_extra( tfm, sizeof(*extra) + sizeof(*sg) * (nfrags + dnfrags), GFP_ATOMIC); ipcomp_cb(skb)->req = req; if (!req) return ERR_PTR(-ENOMEM); extra = acomp_request_extra(req); extra->x = x; dsg = extra->sg; sg = dsg + dnfrags; sg_init_table(sg, nfrags); err = skb_to_sgvec(skb, sg, 0, plen); if (unlikely(err < 0)) return ERR_PTR(err); sg_init_table(dsg, dnfrags); total = 0; for (i = 0; i < dnfrags && total < dlen; i++) { struct page *page; page = alloc_page(GFP_ATOMIC); if (!page) break; sg_set_page(dsg + i, page, PAGE_SIZE, 0); total += PAGE_SIZE; } if (!i) return ERR_PTR(-ENOMEM); sg_mark_end(dsg + i - 1); dlen = min(dlen, total); acomp_request_set_params(req, sg, dsg, plen, dlen); return req; } static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) { struct acomp_req *req; int err; req = ipcomp_setup_req(x, skb, 0, IPCOMP_SCRATCH_SIZE); err = PTR_ERR(req); if (IS_ERR(req)) goto out; acomp_request_set_callback(req, 0, ipcomp_input_done, skb); err = crypto_acomp_decompress(req); if (err == -EINPROGRESS) return err; out: return ipcomp_input_done2(skb, err); } int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) { struct ip_comp_hdr *ipch __maybe_unused; if (!pskb_may_pull(skb, sizeof(*ipch))) return -EINVAL; skb->ip_summed = CHECKSUM_NONE; /* Remove ipcomp header and decompress original payload */ __skb_pull(skb, sizeof(*ipch)); return ipcomp_decompress(x, skb); } EXPORT_SYMBOL_GPL(ipcomp_input); static int ipcomp_output_push(struct sk_buff *skb) { skb_push(skb, -skb_network_offset(skb)); return 0; } static int ipcomp_output_done2(struct xfrm_state *x, struct sk_buff *skb, int err) { struct ip_comp_hdr *ipch; err = ipcomp_post_acomp(skb, err, sizeof(*ipch)); if (err) goto out_ok; /* Install ipcomp header, convert into ipcomp datagram. */ ipch = ip_comp_hdr(skb); ipch->nexthdr = *skb_mac_header(skb); ipch->flags = 0; ipch->cpi = htons((u16 )ntohl(x->id.spi)); *skb_mac_header(skb) = IPPROTO_COMP; out_ok: return ipcomp_output_push(skb); } static void ipcomp_output_done(void *data, int err) { struct ipcomp_req_extra *extra; struct sk_buff *skb = data; struct acomp_req *req; req = ipcomp_cb(skb)->req; extra = acomp_request_extra(req); xfrm_output_resume(skb_to_full_sk(skb), skb, ipcomp_output_done2(extra->x, skb, err)); } static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb) { struct ip_comp_hdr *ipch __maybe_unused; struct acomp_req *req; int err; req = ipcomp_setup_req(x, skb, sizeof(*ipch), skb->len - sizeof(*ipch)); err = PTR_ERR(req); if (IS_ERR(req)) goto out; acomp_request_set_callback(req, 0, ipcomp_output_done, skb); err = crypto_acomp_compress(req); if (err == -EINPROGRESS) return err; out: return ipcomp_output_done2(x, skb, err); } int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb) { struct ipcomp_data *ipcd = x->data; if (skb->len < ipcd->threshold) { /* Don't bother compressing */ return ipcomp_output_push(skb); } return ipcomp_compress(x, skb); } EXPORT_SYMBOL_GPL(ipcomp_output); static void ipcomp_free_data(struct ipcomp_data *ipcd) { crypto_free_acomp(ipcd->tfm); } void ipcomp_destroy(struct xfrm_state *x) { struct ipcomp_data *ipcd = x->data; if (!ipcd) return; ipcomp_free_data(ipcd); kfree(ipcd); } EXPORT_SYMBOL_GPL(ipcomp_destroy); int ipcomp_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { int err; struct ipcomp_data *ipcd; struct xfrm_algo_desc *calg_desc; err = -EINVAL; if (!x->calg) { NL_SET_ERR_MSG(extack, "Missing required compression algorithm"); goto out; } if (x->encap) { NL_SET_ERR_MSG(extack, "IPComp is not compatible with encapsulation"); goto out; } err = -ENOMEM; ipcd = kzalloc(sizeof(*ipcd), GFP_KERNEL); if (!ipcd) goto out; ipcd->tfm = crypto_alloc_acomp(x->calg->alg_name, 0, 0); if (IS_ERR(ipcd->tfm)) goto error; calg_desc = xfrm_calg_get_byname(x->calg->alg_name, 0); BUG_ON(!calg_desc); ipcd->threshold = calg_desc->uinfo.comp.threshold; x->data = ipcd; err = 0; out: return err; error: ipcomp_free_data(ipcd); kfree(ipcd); goto out; } EXPORT_SYMBOL_GPL(ipcomp_init_state); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) - RFC3173"); MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
4 4 4 12 1 1 10 2 12 12 4 8 8 4 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 // SPDX-License-Identifier: GPL-2.0-only /* * linux/net/netfilter/xt_IDLETIMER.c * * Netfilter module to trigger a timer when packet matches. * After timer expires a kevent will be sent. * * Copyright (C) 2004, 2010 Nokia Corporation * Written by Timo Teras <ext-timo.teras@nokia.com> * * Converted to x_tables and reworked for upstream inclusion * by Luciano Coelho <luciano.coelho@nokia.com> * * Contact: Luciano Coelho <luciano.coelho@nokia.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/timer.h> #include <linux/alarmtimer.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/netfilter.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_IDLETIMER.h> #include <linux/kdev_t.h> #include <linux/kobject.h> #include <linux/workqueue.h> #include <linux/sysfs.h> struct idletimer_tg { struct list_head entry; struct alarm alarm; struct timer_list timer; struct work_struct work; struct kobject *kobj; struct device_attribute attr; unsigned int refcnt; u8 timer_type; }; static LIST_HEAD(idletimer_tg_list); static DEFINE_MUTEX(list_mutex); static struct kobject *idletimer_tg_kobj; static struct idletimer_tg *__idletimer_tg_find_by_label(const char *label) { struct idletimer_tg *entry; list_for_each_entry(entry, &idletimer_tg_list, entry) { if (!strcmp(label, entry->attr.attr.name)) return entry; } return NULL; } static ssize_t idletimer_tg_show(struct device *dev, struct device_attribute *attr, char *buf) { struct idletimer_tg *timer; unsigned long expires = 0; struct timespec64 ktimespec = {}; long time_diff = 0; mutex_lock(&list_mutex); timer = __idletimer_tg_find_by_label(attr->attr.name); if (timer) { if (timer->timer_type & XT_IDLETIMER_ALARM) { ktime_t expires_alarm = alarm_expires_remaining(&timer->alarm); ktimespec = ktime_to_timespec64(expires_alarm); time_diff = ktimespec.tv_sec; } else { expires = timer->timer.expires; time_diff = jiffies_to_msecs(expires - jiffies) / 1000; } } mutex_unlock(&list_mutex); if (time_after(expires, jiffies) || ktimespec.tv_sec > 0) return sysfs_emit(buf, "%ld\n", time_diff); return sysfs_emit(buf, "0\n"); } static void idletimer_tg_work(struct work_struct *work) { struct idletimer_tg *timer = container_of(work, struct idletimer_tg, work); sysfs_notify(idletimer_tg_kobj, NULL, timer->attr.attr.name); } static void idletimer_tg_expired(struct timer_list *t) { struct idletimer_tg *timer = timer_container_of(timer, t, timer); pr_debug("timer %s expired\n", timer->attr.attr.name); schedule_work(&timer->work); } static void idletimer_tg_alarmproc(struct alarm *alarm, ktime_t now) { struct idletimer_tg *timer = alarm->data; pr_debug("alarm %s expired\n", timer->attr.attr.name); schedule_work(&timer->work); } static int idletimer_check_sysfs_name(const char *name, unsigned int size) { int ret; ret = xt_check_proc_name(name, size); if (ret < 0) return ret; if (!strcmp(name, "power") || !strcmp(name, "subsystem") || !strcmp(name, "uevent")) return -EINVAL; return 0; } static int idletimer_tg_create(struct idletimer_tg_info *info) { int ret; info->timer = kzalloc(sizeof(*info->timer), GFP_KERNEL); if (!info->timer) { ret = -ENOMEM; goto out; } ret = idletimer_check_sysfs_name(info->label, sizeof(info->label)); if (ret < 0) goto out_free_timer; sysfs_attr_init(&info->timer->attr.attr); info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL); if (!info->timer->attr.attr.name) { ret = -ENOMEM; goto out_free_timer; } info->timer->attr.attr.mode = 0444; info->timer->attr.show = idletimer_tg_show; ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr); if (ret < 0) { pr_debug("couldn't add file to sysfs"); goto out_free_attr; } list_add(&info->timer->entry, &idletimer_tg_list); timer_setup(&info->timer->timer, idletimer_tg_expired, 0); info->timer->refcnt = 1; INIT_WORK(&info->timer->work, idletimer_tg_work); mod_timer(&info->timer->timer, secs_to_jiffies(info->timeout) + jiffies); return 0; out_free_attr: kfree(info->timer->attr.attr.name); out_free_timer: kfree(info->timer); out: return ret; } static int idletimer_tg_create_v1(struct idletimer_tg_info_v1 *info) { int ret; info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL); if (!info->timer) { ret = -ENOMEM; goto out; } ret = idletimer_check_sysfs_name(info->label, sizeof(info->label)); if (ret < 0) goto out_free_timer; sysfs_attr_init(&info->timer->attr.attr); info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL); if (!info->timer->attr.attr.name) { ret = -ENOMEM; goto out_free_timer; } info->timer->attr.attr.mode = 0444; info->timer->attr.show = idletimer_tg_show; ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr); if (ret < 0) { pr_debug("couldn't add file to sysfs"); goto out_free_attr; } /* notify userspace */ kobject_uevent(idletimer_tg_kobj,KOBJ_ADD); list_add(&info->timer->entry, &idletimer_tg_list); pr_debug("timer type value is %u", info->timer_type); info->timer->timer_type = info->timer_type; info->timer->refcnt = 1; INIT_WORK(&info->timer->work, idletimer_tg_work); if (info->timer->timer_type & XT_IDLETIMER_ALARM) { ktime_t tout; alarm_init(&info->timer->alarm, ALARM_BOOTTIME, idletimer_tg_alarmproc); info->timer->alarm.data = info->timer; tout = ktime_set(info->timeout, 0); alarm_start_relative(&info->timer->alarm, tout); } else { timer_setup(&info->timer->timer, idletimer_tg_expired, 0); mod_timer(&info->timer->timer, secs_to_jiffies(info->timeout) + jiffies); } return 0; out_free_attr: kfree(info->timer->attr.attr.name); out_free_timer: kfree(info->timer); out: return ret; } /* * The actual xt_tables plugin. */ static unsigned int idletimer_tg_target(struct sk_buff *skb, const struct xt_action_param *par) { const struct idletimer_tg_info *info = par->targinfo; pr_debug("resetting timer %s, timeout period %u\n", info->label, info->timeout); mod_timer(&info->timer->timer, secs_to_jiffies(info->timeout) + jiffies); return XT_CONTINUE; } /* * The actual xt_tables plugin. */ static unsigned int idletimer_tg_target_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct idletimer_tg_info_v1 *info = par->targinfo; pr_debug("resetting timer %s, timeout period %u\n", info->label, info->timeout); if (info->timer->timer_type & XT_IDLETIMER_ALARM) { ktime_t tout = ktime_set(info->timeout, 0); alarm_start_relative(&info->timer->alarm, tout); } else { mod_timer(&info->timer->timer, secs_to_jiffies(info->timeout) + jiffies); } return XT_CONTINUE; } static int idletimer_tg_helper(struct idletimer_tg_info *info) { if (info->timeout == 0) { pr_debug("timeout value is zero\n"); return -EINVAL; } if (info->timeout >= INT_MAX / 1000) { pr_debug("timeout value is too big\n"); return -EINVAL; } if (info->label[0] == '\0' || strnlen(info->label, MAX_IDLETIMER_LABEL_SIZE) == MAX_IDLETIMER_LABEL_SIZE) { pr_debug("label is empty or not nul-terminated\n"); return -EINVAL; } return 0; } static int idletimer_tg_checkentry(const struct xt_tgchk_param *par) { struct idletimer_tg_info *info = par->targinfo; int ret; pr_debug("checkentry targinfo%s\n", info->label); ret = idletimer_tg_helper(info); if(ret < 0) { pr_debug("checkentry helper return invalid\n"); return -EINVAL; } mutex_lock(&list_mutex); info->timer = __idletimer_tg_find_by_label(info->label); if (info->timer) { info->timer->refcnt++; mod_timer(&info->timer->timer, secs_to_jiffies(info->timeout) + jiffies); pr_debug("increased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); } else { ret = idletimer_tg_create(info); if (ret < 0) { pr_debug("failed to create timer\n"); mutex_unlock(&list_mutex); return ret; } } mutex_unlock(&list_mutex); return 0; } static int idletimer_tg_checkentry_v1(const struct xt_tgchk_param *par) { struct idletimer_tg_info_v1 *info = par->targinfo; int ret; pr_debug("checkentry targinfo%s\n", info->label); if (info->send_nl_msg) return -EOPNOTSUPP; ret = idletimer_tg_helper((struct idletimer_tg_info *)info); if(ret < 0) { pr_debug("checkentry helper return invalid\n"); return -EINVAL; } if (info->timer_type > XT_IDLETIMER_ALARM) { pr_debug("invalid value for timer type\n"); return -EINVAL; } mutex_lock(&list_mutex); info->timer = __idletimer_tg_find_by_label(info->label); if (info->timer) { if (info->timer->timer_type != info->timer_type) { pr_debug("Adding/Replacing rule with same label and different timer type is not allowed\n"); mutex_unlock(&list_mutex); return -EINVAL; } info->timer->refcnt++; if (info->timer_type & XT_IDLETIMER_ALARM) { /* calculate remaining expiry time */ ktime_t tout = alarm_expires_remaining(&info->timer->alarm); struct timespec64 ktimespec = ktime_to_timespec64(tout); if (ktimespec.tv_sec > 0) { pr_debug("time_expiry_remaining %lld\n", ktimespec.tv_sec); alarm_start_relative(&info->timer->alarm, tout); } } else { mod_timer(&info->timer->timer, secs_to_jiffies(info->timeout) + jiffies); } pr_debug("increased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); } else { ret = idletimer_tg_create_v1(info); if (ret < 0) { pr_debug("failed to create timer\n"); mutex_unlock(&list_mutex); return ret; } } mutex_unlock(&list_mutex); return 0; } static void idletimer_tg_destroy(const struct xt_tgdtor_param *par) { const struct idletimer_tg_info *info = par->targinfo; pr_debug("destroy targinfo %s\n", info->label); mutex_lock(&list_mutex); if (--info->timer->refcnt > 0) { pr_debug("decreased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); mutex_unlock(&list_mutex); return; } pr_debug("deleting timer %s\n", info->label); list_del(&info->timer->entry); mutex_unlock(&list_mutex); timer_shutdown_sync(&info->timer->timer); cancel_work_sync(&info->timer->work); sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); kfree(info->timer->attr.attr.name); kfree(info->timer); } static void idletimer_tg_destroy_v1(const struct xt_tgdtor_param *par) { const struct idletimer_tg_info_v1 *info = par->targinfo; pr_debug("destroy targinfo %s\n", info->label); mutex_lock(&list_mutex); if (--info->timer->refcnt > 0) { pr_debug("decreased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); mutex_unlock(&list_mutex); return; } pr_debug("deleting timer %s\n", info->label); list_del(&info->timer->entry); mutex_unlock(&list_mutex); if (info->timer->timer_type & XT_IDLETIMER_ALARM) { alarm_cancel(&info->timer->alarm); } else { timer_shutdown_sync(&info->timer->timer); } cancel_work_sync(&info->timer->work); sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); kfree(info->timer->attr.attr.name); kfree(info->timer); } static struct xt_target idletimer_tg[] __read_mostly = { { .name = "IDLETIMER", .family = NFPROTO_IPV4, .target = idletimer_tg_target, .targetsize = sizeof(struct idletimer_tg_info), .usersize = offsetof(struct idletimer_tg_info, timer), .checkentry = idletimer_tg_checkentry, .destroy = idletimer_tg_destroy, .me = THIS_MODULE, }, { .name = "IDLETIMER", .family = NFPROTO_IPV4, .revision = 1, .target = idletimer_tg_target_v1, .targetsize = sizeof(struct idletimer_tg_info_v1), .usersize = offsetof(struct idletimer_tg_info_v1, timer), .checkentry = idletimer_tg_checkentry_v1, .destroy = idletimer_tg_destroy_v1, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "IDLETIMER", .family = NFPROTO_IPV6, .target = idletimer_tg_target, .targetsize = sizeof(struct idletimer_tg_info), .usersize = offsetof(struct idletimer_tg_info, timer), .checkentry = idletimer_tg_checkentry, .destroy = idletimer_tg_destroy, .me = THIS_MODULE, }, { .name = "IDLETIMER", .family = NFPROTO_IPV6, .revision = 1, .target = idletimer_tg_target_v1, .targetsize = sizeof(struct idletimer_tg_info_v1), .usersize = offsetof(struct idletimer_tg_info_v1, timer), .checkentry = idletimer_tg_checkentry_v1, .destroy = idletimer_tg_destroy_v1, .me = THIS_MODULE, }, #endif }; static struct class *idletimer_tg_class; static struct device *idletimer_tg_device; static int __init idletimer_tg_init(void) { int err; idletimer_tg_class = class_create("xt_idletimer"); err = PTR_ERR(idletimer_tg_class); if (IS_ERR(idletimer_tg_class)) { pr_debug("couldn't register device class\n"); goto out; } idletimer_tg_device = device_create(idletimer_tg_class, NULL, MKDEV(0, 0), NULL, "timers"); err = PTR_ERR(idletimer_tg_device); if (IS_ERR(idletimer_tg_device)) { pr_debug("couldn't register system device\n"); goto out_class; } idletimer_tg_kobj = &idletimer_tg_device->kobj; err = xt_register_targets(idletimer_tg, ARRAY_SIZE(idletimer_tg)); if (err < 0) { pr_debug("couldn't register xt target\n"); goto out_dev; } return 0; out_dev: device_destroy(idletimer_tg_class, MKDEV(0, 0)); out_class: class_destroy(idletimer_tg_class); out: return err; } static void __exit idletimer_tg_exit(void) { xt_unregister_targets(idletimer_tg, ARRAY_SIZE(idletimer_tg)); device_destroy(idletimer_tg_class, MKDEV(0, 0)); class_destroy(idletimer_tg_class); } module_init(idletimer_tg_init); module_exit(idletimer_tg_exit); MODULE_AUTHOR("Timo Teras <ext-timo.teras@nokia.com>"); MODULE_AUTHOR("Luciano Coelho <luciano.coelho@nokia.com>"); MODULE_DESCRIPTION("Xtables: idle time monitor"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS("ipt_IDLETIMER"); MODULE_ALIAS("ip6t_IDLETIMER");
19 19 2 2 12 3 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 // SPDX-License-Identifier: GPL-2.0-or-later /* * SR-IPv6 implementation * * Author: * David Lebrun <david.lebrun@uclouvain.be> */ #include <linux/types.h> #include <linux/skbuff.h> #include <linux/net.h> #include <linux/module.h> #include <net/ip.h> #include <net/ip_tunnels.h> #include <net/lwtunnel.h> #include <net/netevent.h> #include <net/netns/generic.h> #include <net/ip6_fib.h> #include <net/route.h> #include <net/seg6.h> #include <linux/seg6.h> #include <linux/seg6_iptunnel.h> #include <net/addrconf.h> #include <net/ip6_route.h> #include <net/dst_cache.h> #ifdef CONFIG_IPV6_SEG6_HMAC #include <net/seg6_hmac.h> #endif #include <linux/netfilter.h> static size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo) { int head = 0; switch (tuninfo->mode) { case SEG6_IPTUN_MODE_INLINE: break; case SEG6_IPTUN_MODE_ENCAP: case SEG6_IPTUN_MODE_ENCAP_RED: head = sizeof(struct ipv6hdr); break; case SEG6_IPTUN_MODE_L2ENCAP: case SEG6_IPTUN_MODE_L2ENCAP_RED: return 0; } return ((tuninfo->srh->hdrlen + 1) << 3) + head; } struct seg6_lwt { struct dst_cache cache; struct seg6_iptunnel_encap tuninfo[]; }; static inline struct seg6_lwt *seg6_lwt_lwtunnel(struct lwtunnel_state *lwt) { return (struct seg6_lwt *)lwt->data; } static inline struct seg6_iptunnel_encap * seg6_encap_lwtunnel(struct lwtunnel_state *lwt) { return seg6_lwt_lwtunnel(lwt)->tuninfo; } static const struct nla_policy seg6_iptunnel_policy[SEG6_IPTUNNEL_MAX + 1] = { [SEG6_IPTUNNEL_SRH] = { .type = NLA_BINARY }, }; static int nla_put_srh(struct sk_buff *skb, int attrtype, struct seg6_iptunnel_encap *tuninfo) { struct seg6_iptunnel_encap *data; struct nlattr *nla; int len; len = SEG6_IPTUN_ENCAP_SIZE(tuninfo); nla = nla_reserve(skb, attrtype, len); if (!nla) return -EMSGSIZE; data = nla_data(nla); memcpy(data, tuninfo, len); return 0; } static void set_tun_src(struct net *net, struct net_device *dev, struct in6_addr *daddr, struct in6_addr *saddr) { struct seg6_pernet_data *sdata = seg6_pernet(net); struct in6_addr *tun_src; rcu_read_lock(); tun_src = rcu_dereference(sdata->tun_src); if (!ipv6_addr_any(tun_src)) { memcpy(saddr, tun_src, sizeof(struct in6_addr)); } else { ipv6_dev_get_saddr(net, dev, daddr, IPV6_PREFER_SRC_PUBLIC, saddr); } rcu_read_unlock(); } /* Compute flowlabel for outer IPv6 header */ static __be32 seg6_make_flowlabel(struct net *net, struct sk_buff *skb, struct ipv6hdr *inner_hdr) { int do_flowlabel = net->ipv6.sysctl.seg6_flowlabel; __be32 flowlabel = 0; u32 hash; if (do_flowlabel > 0) { hash = skb_get_hash(skb); hash = rol32(hash, 16); flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK; } else if (!do_flowlabel && skb->protocol == htons(ETH_P_IPV6)) { flowlabel = ip6_flowlabel(inner_hdr); } return flowlabel; } static int __seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto, struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst_dev(dst); struct net *net = dev_net(dev); struct ipv6hdr *hdr, *inner_hdr; struct ipv6_sr_hdr *isrh; int hdrlen, tot_len, err; __be32 flowlabel; hdrlen = (osrh->hdrlen + 1) << 3; tot_len = hdrlen + sizeof(*hdr); err = skb_cow_head(skb, tot_len + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; inner_hdr = ipv6_hdr(skb); flowlabel = seg6_make_flowlabel(net, skb, inner_hdr); skb_push(skb, tot_len); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); hdr = ipv6_hdr(skb); /* inherit tc, flowlabel and hlim * hlim will be decremented in ip6_forward() afterwards and * decapsulation will overwrite inner hlim with outer hlim */ if (skb->protocol == htons(ETH_P_IPV6)) { ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)), flowlabel); hdr->hop_limit = inner_hdr->hop_limit; } else { ip6_flow_hdr(hdr, 0, flowlabel); hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb)); memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); /* the control block has been erased, so we have to set the * iif once again. * We read the receiving interface index directly from the * skb->skb_iif as it is done in the IPv4 receiving path (i.e.: * ip_rcv_core(...)). */ IP6CB(skb)->iif = skb->skb_iif; } hdr->nexthdr = NEXTHDR_ROUTING; isrh = (void *)hdr + sizeof(*hdr); memcpy(isrh, osrh, hdrlen); isrh->nexthdr = proto; hdr->daddr = isrh->segments[isrh->first_segment]; set_tun_src(net, dev, &hdr->daddr, &hdr->saddr); #ifdef CONFIG_IPV6_SEG6_HMAC if (sr_has_hmac(isrh)) { err = seg6_push_hmac(net, &hdr->saddr, isrh); if (unlikely(err)) return err; } #endif hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_postpush_rcsum(skb, hdr, tot_len); return 0; } /* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) { return __seg6_do_srh_encap(skb, osrh, proto, NULL); } EXPORT_SYMBOL_GPL(seg6_do_srh_encap); /* encapsulate an IPv6 packet within an outer IPv6 header with reduced SRH */ static int seg6_do_srh_encap_red(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto, struct dst_entry *cache_dst) { __u8 first_seg = osrh->first_segment; struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst_dev(dst); struct net *net = dev_net(dev); struct ipv6hdr *hdr, *inner_hdr; int hdrlen = ipv6_optlen(osrh); int red_tlv_offset, tlv_offset; struct ipv6_sr_hdr *isrh; bool skip_srh = false; __be32 flowlabel; int tot_len, err; int red_hdrlen; int tlvs_len; if (first_seg > 0) { red_hdrlen = hdrlen - sizeof(struct in6_addr); } else { /* NOTE: if tag/flags and/or other TLVs are introduced in the * seg6_iptunnel infrastructure, they should be considered when * deciding to skip the SRH. */ skip_srh = !sr_has_hmac(osrh); red_hdrlen = skip_srh ? 0 : hdrlen; } tot_len = red_hdrlen + sizeof(struct ipv6hdr); err = skb_cow_head(skb, tot_len + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; inner_hdr = ipv6_hdr(skb); flowlabel = seg6_make_flowlabel(net, skb, inner_hdr); skb_push(skb, tot_len); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); hdr = ipv6_hdr(skb); /* based on seg6_do_srh_encap() */ if (skb->protocol == htons(ETH_P_IPV6)) { ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)), flowlabel); hdr->hop_limit = inner_hdr->hop_limit; } else { ip6_flow_hdr(hdr, 0, flowlabel); hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb)); memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); IP6CB(skb)->iif = skb->skb_iif; } /* no matter if we have to skip the SRH or not, the first segment * always comes in the pushed IPv6 header. */ hdr->daddr = osrh->segments[first_seg]; if (skip_srh) { hdr->nexthdr = proto; set_tun_src(net, dev, &hdr->daddr, &hdr->saddr); goto out; } /* we cannot skip the SRH, slow path */ hdr->nexthdr = NEXTHDR_ROUTING; isrh = (void *)hdr + sizeof(struct ipv6hdr); if (unlikely(!first_seg)) { /* this is a very rare case; we have only one SID but * we cannot skip the SRH since we are carrying some * other info. */ memcpy(isrh, osrh, hdrlen); goto srcaddr; } tlv_offset = sizeof(*osrh) + (first_seg + 1) * sizeof(struct in6_addr); red_tlv_offset = tlv_offset - sizeof(struct in6_addr); memcpy(isrh, osrh, red_tlv_offset); tlvs_len = hdrlen - tlv_offset; if (unlikely(tlvs_len > 0)) { const void *s = (const void *)osrh + tlv_offset; void *d = (void *)isrh + red_tlv_offset; memcpy(d, s, tlvs_len); } --isrh->first_segment; isrh->hdrlen -= 2; srcaddr: isrh->nexthdr = proto; set_tun_src(net, dev, &hdr->daddr, &hdr->saddr); #ifdef CONFIG_IPV6_SEG6_HMAC if (unlikely(!skip_srh && sr_has_hmac(isrh))) { err = seg6_push_hmac(net, &hdr->saddr, isrh); if (unlikely(err)) return err; } #endif out: hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_postpush_rcsum(skb, hdr, tot_len); return 0; } static int __seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, struct dst_entry *cache_dst) { struct ipv6hdr *hdr, *oldhdr; struct ipv6_sr_hdr *isrh; int hdrlen, err; hdrlen = (osrh->hdrlen + 1) << 3; err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; oldhdr = ipv6_hdr(skb); skb_pull(skb, sizeof(struct ipv6hdr)); skb_postpull_rcsum(skb, skb_network_header(skb), sizeof(struct ipv6hdr)); skb_push(skb, sizeof(struct ipv6hdr) + hdrlen); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); hdr = ipv6_hdr(skb); memmove(hdr, oldhdr, sizeof(*hdr)); isrh = (void *)hdr + sizeof(*hdr); memcpy(isrh, osrh, hdrlen); isrh->nexthdr = hdr->nexthdr; hdr->nexthdr = NEXTHDR_ROUTING; isrh->segments[0] = hdr->daddr; hdr->daddr = isrh->segments[isrh->first_segment]; #ifdef CONFIG_IPV6_SEG6_HMAC if (sr_has_hmac(isrh)) { struct net *net = skb_dst_dev_net(skb); err = seg6_push_hmac(net, &hdr->saddr, isrh); if (unlikely(err)) return err; } #endif hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_postpush_rcsum(skb, hdr, sizeof(struct ipv6hdr) + hdrlen); return 0; } static int seg6_do_srh(struct sk_buff *skb, struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); struct seg6_iptunnel_encap *tinfo; int proto, err = 0; tinfo = seg6_encap_lwtunnel(dst->lwtstate); switch (tinfo->mode) { case SEG6_IPTUN_MODE_INLINE: if (skb->protocol != htons(ETH_P_IPV6)) return -EINVAL; err = __seg6_do_srh_inline(skb, tinfo->srh, cache_dst); if (err) return err; break; case SEG6_IPTUN_MODE_ENCAP: case SEG6_IPTUN_MODE_ENCAP_RED: err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6); if (err) return err; if (skb->protocol == htons(ETH_P_IPV6)) proto = IPPROTO_IPV6; else if (skb->protocol == htons(ETH_P_IP)) proto = IPPROTO_IPIP; else return -EINVAL; if (tinfo->mode == SEG6_IPTUN_MODE_ENCAP) err = __seg6_do_srh_encap(skb, tinfo->srh, proto, cache_dst); else err = seg6_do_srh_encap_red(skb, tinfo->srh, proto, cache_dst); if (err) return err; skb_set_inner_transport_header(skb, skb_transport_offset(skb)); skb_set_inner_protocol(skb, skb->protocol); skb->protocol = htons(ETH_P_IPV6); break; case SEG6_IPTUN_MODE_L2ENCAP: case SEG6_IPTUN_MODE_L2ENCAP_RED: if (!skb_mac_header_was_set(skb)) return -EINVAL; if (pskb_expand_head(skb, skb->mac_len, 0, GFP_ATOMIC) < 0) return -ENOMEM; skb_mac_header_rebuild(skb); skb_push(skb, skb->mac_len); if (tinfo->mode == SEG6_IPTUN_MODE_L2ENCAP) err = __seg6_do_srh_encap(skb, tinfo->srh, IPPROTO_ETHERNET, cache_dst); else err = seg6_do_srh_encap_red(skb, tinfo->srh, IPPROTO_ETHERNET, cache_dst); if (err) return err; skb->protocol = htons(ETH_P_IPV6); break; } skb_set_transport_header(skb, sizeof(struct ipv6hdr)); nf_reset_ct(skb); return 0; } /* insert an SRH within an IPv6 packet, just after the IPv6 header */ int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) { return __seg6_do_srh_inline(skb, osrh, NULL); } EXPORT_SYMBOL_GPL(seg6_do_srh_inline); static int seg6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { return dst_input(skb); } static int seg6_input_core(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; struct lwtunnel_state *lwtst; struct seg6_lwt *slwt; int err; /* We cannot dereference "orig_dst" once ip6_route_input() or * skb_dst_drop() is called. However, in order to detect a dst loop, we * need the address of its lwtstate. So, save the address of lwtstate * now and use it later as a comparison. */ lwtst = orig_dst->lwtstate; slwt = seg6_lwt_lwtunnel(lwtst); local_bh_disable(); dst = dst_cache_get(&slwt->cache); local_bh_enable(); err = seg6_do_srh(skb, dst); if (unlikely(err)) { dst_release(dst); goto drop; } if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); /* cache only if we don't create a dst reference loop */ if (!dst->error && lwtst != dst->lwtstate) { local_bh_disable(); dst_cache_set_ip6(&slwt->cache, dst, &ipv6_hdr(skb)->saddr); local_bh_enable(); } err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst))); if (unlikely(err)) goto drop; } else { skb_dst_drop(skb); skb_dst_set(skb, dst); } if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, dev_net(skb->dev), NULL, skb, NULL, skb_dst_dev(skb), seg6_input_finish); return seg6_input_finish(dev_net(skb->dev), NULL, skb); drop: kfree_skb(skb); return err; } static int seg6_input_nf(struct sk_buff *skb) { struct net_device *dev = skb_dst_dev(skb); struct net *net = dev_net(skb->dev); switch (skb->protocol) { case htons(ETH_P_IP): return NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, NULL, skb, NULL, dev, seg6_input_core); case htons(ETH_P_IPV6): return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, NULL, skb, NULL, dev, seg6_input_core); } return -EINVAL; } static int seg6_input(struct sk_buff *skb) { if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return seg6_input_nf(skb); return seg6_input_core(dev_net(skb->dev), NULL, skb); } static int seg6_output_core(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; struct seg6_lwt *slwt; int err; slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); local_bh_disable(); dst = dst_cache_get(&slwt->cache); local_bh_enable(); err = seg6_do_srh(skb, dst); if (unlikely(err)) goto drop; if (unlikely(!dst)) { struct ipv6hdr *hdr = ipv6_hdr(skb); struct flowi6 fl6; memset(&fl6, 0, sizeof(fl6)); fl6.daddr = hdr->daddr; fl6.saddr = hdr->saddr; fl6.flowlabel = ip6_flowinfo(hdr); fl6.flowi6_mark = skb->mark; fl6.flowi6_proto = hdr->nexthdr; dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { err = dst->error; goto drop; } /* cache only if we don't create a dst reference loop */ if (orig_dst->lwtstate != dst->lwtstate) { local_bh_disable(); dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr); local_bh_enable(); } err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst))); if (unlikely(err)) goto drop; } skb_dst_drop(skb); skb_dst_set(skb, dst); if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, dst_dev(dst), dst_output); return dst_output(net, sk, skb); drop: dst_release(dst); kfree_skb(skb); return err; } static int seg6_output_nf(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst_dev(skb); switch (skb->protocol) { case htons(ETH_P_IP): return NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, NULL, dev, seg6_output_core); case htons(ETH_P_IPV6): return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, NULL, dev, seg6_output_core); } return -EINVAL; } static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return seg6_output_nf(net, sk, skb); return seg6_output_core(net, sk, skb); } static int seg6_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct nlattr *tb[SEG6_IPTUNNEL_MAX + 1]; struct seg6_iptunnel_encap *tuninfo; struct lwtunnel_state *newts; int tuninfo_len, min_size; struct seg6_lwt *slwt; int err; if (family != AF_INET && family != AF_INET6) return -EINVAL; err = nla_parse_nested_deprecated(tb, SEG6_IPTUNNEL_MAX, nla, seg6_iptunnel_policy, extack); if (err < 0) return err; if (!tb[SEG6_IPTUNNEL_SRH]) return -EINVAL; tuninfo = nla_data(tb[SEG6_IPTUNNEL_SRH]); tuninfo_len = nla_len(tb[SEG6_IPTUNNEL_SRH]); /* tuninfo must contain at least the iptunnel encap structure, * the SRH and one segment */ min_size = sizeof(*tuninfo) + sizeof(struct ipv6_sr_hdr) + sizeof(struct in6_addr); if (tuninfo_len < min_size) return -EINVAL; switch (tuninfo->mode) { case SEG6_IPTUN_MODE_INLINE: if (family != AF_INET6) return -EINVAL; break; case SEG6_IPTUN_MODE_ENCAP: break; case SEG6_IPTUN_MODE_L2ENCAP: break; case SEG6_IPTUN_MODE_ENCAP_RED: break; case SEG6_IPTUN_MODE_L2ENCAP_RED: break; default: return -EINVAL; } /* verify that SRH is consistent */ if (!seg6_validate_srh(tuninfo->srh, tuninfo_len - sizeof(*tuninfo), false)) return -EINVAL; newts = lwtunnel_state_alloc(tuninfo_len + sizeof(*slwt)); if (!newts) return -ENOMEM; slwt = seg6_lwt_lwtunnel(newts); err = dst_cache_init(&slwt->cache, GFP_ATOMIC); if (err) { kfree(newts); return err; } memcpy(&slwt->tuninfo, tuninfo, tuninfo_len); newts->type = LWTUNNEL_ENCAP_SEG6; newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT; if (tuninfo->mode != SEG6_IPTUN_MODE_L2ENCAP) newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; newts->headroom = seg6_lwt_headroom(tuninfo); *ts = newts; return 0; } static void seg6_destroy_state(struct lwtunnel_state *lwt) { dst_cache_destroy(&seg6_lwt_lwtunnel(lwt)->cache); } static int seg6_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { struct seg6_iptunnel_encap *tuninfo = seg6_encap_lwtunnel(lwtstate); if (nla_put_srh(skb, SEG6_IPTUNNEL_SRH, tuninfo)) return -EMSGSIZE; return 0; } static int seg6_encap_nlsize(struct lwtunnel_state *lwtstate) { struct seg6_iptunnel_encap *tuninfo = seg6_encap_lwtunnel(lwtstate); return nla_total_size(SEG6_IPTUN_ENCAP_SIZE(tuninfo)); } static int seg6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) { struct seg6_iptunnel_encap *a_hdr = seg6_encap_lwtunnel(a); struct seg6_iptunnel_encap *b_hdr = seg6_encap_lwtunnel(b); int len = SEG6_IPTUN_ENCAP_SIZE(a_hdr); if (len != SEG6_IPTUN_ENCAP_SIZE(b_hdr)) return 1; return memcmp(a_hdr, b_hdr, len); } static const struct lwtunnel_encap_ops seg6_iptun_ops = { .build_state = seg6_build_state, .destroy_state = seg6_destroy_state, .output = seg6_output, .input = seg6_input, .fill_encap = seg6_fill_encap_info, .get_encap_size = seg6_encap_nlsize, .cmp_encap = seg6_encap_cmp, .owner = THIS_MODULE, }; int __init seg6_iptunnel_init(void) { return lwtunnel_encap_add_ops(&seg6_iptun_ops, LWTUNNEL_ENCAP_SEG6); } void seg6_iptunnel_exit(void) { lwtunnel_encap_del_ops(&seg6_iptun_ops, LWTUNNEL_ENCAP_SEG6); }
129 4899 9047 4891 4896 6 3 794 4896 366 9077 9076 366 4003 3229 947 4003 4300 4311 4305 4304 3237 8105 2868 2866 2868 56 72 127 127 8 945 1244 161 31 25 6 31 31 5142 5133 1928 4343 448 363 3 629 369 21 12 20 21 21 20 12 21 12 20 20 21 20 21 778 6336 82 6177 778 776 5612 5628 5139 4896 4893 4896 4894 777 4898 845 622 358 4894 709 4895 4891 4901 4892 775 775 777 775 778 4894 329 328 331 331 254 153 130 49 133 49 131 132 153 153 151 152 856 15 848 852 15 15 15 845 847 846 844 844 20 19 4302 4305 4890 4890 3549 2852 4898 2888 3234 3556 4899 4847 366 366 365 365 4892 4894 17 17 180 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2001 Momchil Velikov * Portions Copyright (C) 2001 Christoph Hellwig * Copyright (C) 2005 SGI, Christoph Lameter * Copyright (C) 2006 Nick Piggin * Copyright (C) 2012 Konstantin Khlebnikov * Copyright (C) 2016 Intel, Matthew Wilcox * Copyright (C) 2016 Intel, Ross Zwisler */ #include <linux/bitmap.h> #include <linux/bitops.h> #include <linux/bug.h> #include <linux/cpu.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/idr.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/kmemleak.h> #include <linux/percpu.h> #include <linux/preempt.h> /* in_interrupt() */ #include <linux/radix-tree.h> #include <linux/rcupdate.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/xarray.h> #include "radix-tree.h" /* * Radix tree node cache. */ struct kmem_cache *radix_tree_node_cachep; /* * The radix tree is variable-height, so an insert operation not only has * to build the branch to its corresponding item, it also has to build the * branch to existing items if the size has to be increased (by * radix_tree_extend). * * The worst case is a zero height tree with just a single item at index 0, * and then inserting an item at index ULONG_MAX. This requires 2 new branches * of RADIX_TREE_MAX_PATH size to be created, with only the root node shared. * Hence: */ #define RADIX_TREE_PRELOAD_SIZE (RADIX_TREE_MAX_PATH * 2 - 1) /* * The IDR does not have to be as high as the radix tree since it uses * signed integers, not unsigned longs. */ #define IDR_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(int) - 1) #define IDR_MAX_PATH (DIV_ROUND_UP(IDR_INDEX_BITS, \ RADIX_TREE_MAP_SHIFT)) #define IDR_PRELOAD_SIZE (IDR_MAX_PATH * 2 - 1) /* * Per-cpu pool of preloaded nodes */ DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { .lock = INIT_LOCAL_LOCK(lock), }; EXPORT_PER_CPU_SYMBOL_GPL(radix_tree_preloads); static inline struct radix_tree_node *entry_to_node(void *ptr) { return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); } static inline void *node_to_entry(void *ptr) { return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); } #define RADIX_TREE_RETRY XA_RETRY_ENTRY static inline unsigned long get_slot_offset(const struct radix_tree_node *parent, void __rcu **slot) { return parent ? slot - parent->slots : 0; } static unsigned int radix_tree_descend(const struct radix_tree_node *parent, struct radix_tree_node **nodep, unsigned long index) { unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK; void __rcu **entry = rcu_dereference_raw(parent->slots[offset]); *nodep = (void *)entry; return offset; } static inline gfp_t root_gfp_mask(const struct radix_tree_root *root) { return root->xa_flags & (__GFP_BITS_MASK & ~GFP_ZONEMASK); } static inline void tag_set(struct radix_tree_node *node, unsigned int tag, int offset) { __set_bit(offset, node->tags[tag]); } static inline void tag_clear(struct radix_tree_node *node, unsigned int tag, int offset) { __clear_bit(offset, node->tags[tag]); } static inline int tag_get(const struct radix_tree_node *node, unsigned int tag, int offset) { return test_bit(offset, node->tags[tag]); } static inline void root_tag_set(struct radix_tree_root *root, unsigned tag) { root->xa_flags |= (__force gfp_t)(1 << (tag + ROOT_TAG_SHIFT)); } static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag) { root->xa_flags &= (__force gfp_t)~(1 << (tag + ROOT_TAG_SHIFT)); } static inline void root_tag_clear_all(struct radix_tree_root *root) { root->xa_flags &= (__force gfp_t)((1 << ROOT_TAG_SHIFT) - 1); } static inline int root_tag_get(const struct radix_tree_root *root, unsigned tag) { return (__force int)root->xa_flags & (1 << (tag + ROOT_TAG_SHIFT)); } static inline unsigned root_tags_get(const struct radix_tree_root *root) { return (__force unsigned)root->xa_flags >> ROOT_TAG_SHIFT; } static inline bool is_idr(const struct radix_tree_root *root) { return !!(root->xa_flags & ROOT_IS_IDR); } /* * Returns 1 if any slot in the node has this tag set. * Otherwise returns 0. */ static inline int any_tag_set(const struct radix_tree_node *node, unsigned int tag) { unsigned idx; for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { if (node->tags[tag][idx]) return 1; } return 0; } static inline void all_tag_set(struct radix_tree_node *node, unsigned int tag) { bitmap_fill(node->tags[tag], RADIX_TREE_MAP_SIZE); } /** * radix_tree_find_next_bit - find the next set bit in a memory region * * @node: where to begin the search * @tag: the tag index * @offset: the bitnumber to start searching at * * Unrollable variant of find_next_bit() for constant size arrays. * Tail bits starting from size to roundup(size, BITS_PER_LONG) must be zero. * Returns next bit offset, or size if nothing found. */ static __always_inline unsigned long radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag, unsigned long offset) { const unsigned long *addr = node->tags[tag]; if (offset < RADIX_TREE_MAP_SIZE) { unsigned long tmp; addr += offset / BITS_PER_LONG; tmp = *addr >> (offset % BITS_PER_LONG); if (tmp) return __ffs(tmp) + offset; offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1); while (offset < RADIX_TREE_MAP_SIZE) { tmp = *++addr; if (tmp) return __ffs(tmp) + offset; offset += BITS_PER_LONG; } } return RADIX_TREE_MAP_SIZE; } static unsigned int iter_offset(const struct radix_tree_iter *iter) { return iter->index & RADIX_TREE_MAP_MASK; } /* * The maximum index which can be stored in a radix tree */ static inline unsigned long shift_maxindex(unsigned int shift) { return (RADIX_TREE_MAP_SIZE << shift) - 1; } static inline unsigned long node_maxindex(const struct radix_tree_node *node) { return shift_maxindex(node->shift); } static unsigned long next_index(unsigned long index, const struct radix_tree_node *node, unsigned long offset) { return (index & ~node_maxindex(node)) + (offset << node->shift); } /* * This assumes that the caller has performed appropriate preallocation, and * that the caller has pinned this thread of control to the current CPU. */ static struct radix_tree_node * radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent, struct radix_tree_root *root, unsigned int shift, unsigned int offset, unsigned int count, unsigned int nr_values) { struct radix_tree_node *ret = NULL; /* * Preload code isn't irq safe and it doesn't make sense to use * preloading during an interrupt anyway as all the allocations have * to be atomic. So just do normal allocation when in interrupt. */ if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) { struct radix_tree_preload *rtp; /* * Even if the caller has preloaded, try to allocate from the * cache first for the new node to get accounted to the memory * cgroup. */ ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask | __GFP_NOWARN); if (ret) goto out; /* * Provided the caller has preloaded here, we will always * succeed in getting a node here (and never reach * kmem_cache_alloc) */ rtp = this_cpu_ptr(&radix_tree_preloads); if (rtp->nr) { ret = rtp->nodes; rtp->nodes = ret->parent; rtp->nr--; } /* * Update the allocation stack trace as this is more useful * for debugging. */ kmemleak_update_trace(ret); goto out; } ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); out: BUG_ON(radix_tree_is_internal_node(ret)); if (ret) { ret->shift = shift; ret->offset = offset; ret->count = count; ret->nr_values = nr_values; ret->parent = parent; ret->array = root; } return ret; } void radix_tree_node_rcu_free(struct rcu_head *head) { struct radix_tree_node *node = container_of(head, struct radix_tree_node, rcu_head); /* * Must only free zeroed nodes into the slab. We can be left with * non-NULL entries by radix_tree_free_nodes, so clear the entries * and tags here. */ memset(node->slots, 0, sizeof(node->slots)); memset(node->tags, 0, sizeof(node->tags)); INIT_LIST_HEAD(&node->private_list); kmem_cache_free(radix_tree_node_cachep, node); } static inline void radix_tree_node_free(struct radix_tree_node *node) { call_rcu(&node->rcu_head, radix_tree_node_rcu_free); } /* * Load up this CPU's radix_tree_node buffer with sufficient objects to * ensure that the addition of a single element in the tree cannot fail. On * success, return zero, with preemption disabled. On error, return -ENOMEM * with preemption not disabled. * * To make use of this facility, the radix tree must be initialised without * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). */ static __must_check int __radix_tree_preload(gfp_t gfp_mask, unsigned nr) { struct radix_tree_preload *rtp; struct radix_tree_node *node; int ret = -ENOMEM; /* * Nodes preloaded by one cgroup can be used by another cgroup, so * they should never be accounted to any particular memory cgroup. */ gfp_mask &= ~__GFP_ACCOUNT; local_lock(&radix_tree_preloads.lock); rtp = this_cpu_ptr(&radix_tree_preloads); while (rtp->nr < nr) { local_unlock(&radix_tree_preloads.lock); node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); if (node == NULL) goto out; local_lock(&radix_tree_preloads.lock); rtp = this_cpu_ptr(&radix_tree_preloads); if (rtp->nr < nr) { node->parent = rtp->nodes; rtp->nodes = node; rtp->nr++; } else { kmem_cache_free(radix_tree_node_cachep, node); } } ret = 0; out: return ret; } /* * Load up this CPU's radix_tree_node buffer with sufficient objects to * ensure that the addition of a single element in the tree cannot fail. On * success, return zero, with preemption disabled. On error, return -ENOMEM * with preemption not disabled. * * To make use of this facility, the radix tree must be initialised without * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). */ int radix_tree_preload(gfp_t gfp_mask) { /* Warn on non-sensical use... */ WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask)); return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE); } EXPORT_SYMBOL(radix_tree_preload); /* * The same as above function, except we don't guarantee preloading happens. * We do it, if we decide it helps. On success, return zero with preemption * disabled. On error, return -ENOMEM with preemption not disabled. */ int radix_tree_maybe_preload(gfp_t gfp_mask) { if (gfpflags_allow_blocking(gfp_mask)) return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE); /* Preloading doesn't help anything with this gfp mask, skip it */ local_lock(&radix_tree_preloads.lock); return 0; } EXPORT_SYMBOL(radix_tree_maybe_preload); static unsigned radix_tree_load_root(const struct radix_tree_root *root, struct radix_tree_node **nodep, unsigned long *maxindex) { struct radix_tree_node *node = rcu_dereference_raw(root->xa_head); *nodep = node; if (likely(radix_tree_is_internal_node(node))) { node = entry_to_node(node); *maxindex = node_maxindex(node); return node->shift + RADIX_TREE_MAP_SHIFT; } *maxindex = 0; return 0; } /* * Extend a radix tree so it can store key @index. */ static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp, unsigned long index, unsigned int shift) { void *entry; unsigned int maxshift; int tag; /* Figure out what the shift should be. */ maxshift = shift; while (index > shift_maxindex(maxshift)) maxshift += RADIX_TREE_MAP_SHIFT; entry = rcu_dereference_raw(root->xa_head); if (!entry && (!is_idr(root) || root_tag_get(root, IDR_FREE))) goto out; do { struct radix_tree_node *node = radix_tree_node_alloc(gfp, NULL, root, shift, 0, 1, 0); if (!node) return -ENOMEM; if (is_idr(root)) { all_tag_set(node, IDR_FREE); if (!root_tag_get(root, IDR_FREE)) { tag_clear(node, IDR_FREE, 0); root_tag_set(root, IDR_FREE); } } else { /* Propagate the aggregated tag info to the new child */ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { if (root_tag_get(root, tag)) tag_set(node, tag, 0); } } BUG_ON(shift > BITS_PER_LONG); if (radix_tree_is_internal_node(entry)) { entry_to_node(entry)->parent = node; } else if (xa_is_value(entry)) { /* Moving a value entry root->xa_head to a node */ node->nr_values = 1; } /* * entry was already in the radix tree, so we do not need * rcu_assign_pointer here */ node->slots[0] = (void __rcu *)entry; entry = node_to_entry(node); rcu_assign_pointer(root->xa_head, entry); shift += RADIX_TREE_MAP_SHIFT; } while (shift <= maxshift); out: return maxshift + RADIX_TREE_MAP_SHIFT; } /** * radix_tree_shrink - shrink radix tree to minimum height * @root: radix tree root */ static inline bool radix_tree_shrink(struct radix_tree_root *root) { bool shrunk = false; for (;;) { struct radix_tree_node *node = rcu_dereference_raw(root->xa_head); struct radix_tree_node *child; if (!radix_tree_is_internal_node(node)) break; node = entry_to_node(node); /* * The candidate node has more than one child, or its child * is not at the leftmost slot, we cannot shrink. */ if (node->count != 1) break; child = rcu_dereference_raw(node->slots[0]); if (!child) break; /* * For an IDR, we must not shrink entry 0 into the root in * case somebody calls idr_replace() with a pointer that * appears to be an internal entry */ if (!node->shift && is_idr(root)) break; if (radix_tree_is_internal_node(child)) entry_to_node(child)->parent = NULL; /* * We don't need rcu_assign_pointer(), since we are simply * moving the node from one part of the tree to another: if it * was safe to dereference the old pointer to it * (node->slots[0]), it will be safe to dereference the new * one (root->xa_head) as far as dependent read barriers go. */ root->xa_head = (void __rcu *)child; if (is_idr(root) && !tag_get(node, IDR_FREE, 0)) root_tag_clear(root, IDR_FREE); /* * We have a dilemma here. The node's slot[0] must not be * NULLed in case there are concurrent lookups expecting to * find the item. However if this was a bottom-level node, * then it may be subject to the slot pointer being visible * to callers dereferencing it. If item corresponding to * slot[0] is subsequently deleted, these callers would expect * their slot to become empty sooner or later. * * For example, lockless pagecache will look up a slot, deref * the page pointer, and if the page has 0 refcount it means it * was concurrently deleted from pagecache so try the deref * again. Fortunately there is already a requirement for logic * to retry the entire slot lookup -- the indirect pointer * problem (replacing direct root node with an indirect pointer * also results in a stale slot). So tag the slot as indirect * to force callers to retry. */ node->count = 0; if (!radix_tree_is_internal_node(child)) { node->slots[0] = (void __rcu *)RADIX_TREE_RETRY; } WARN_ON_ONCE(!list_empty(&node->private_list)); radix_tree_node_free(node); shrunk = true; } return shrunk; } static bool delete_node(struct radix_tree_root *root, struct radix_tree_node *node) { bool deleted = false; do { struct radix_tree_node *parent; if (node->count) { if (node_to_entry(node) == rcu_dereference_raw(root->xa_head)) deleted |= radix_tree_shrink(root); return deleted; } parent = node->parent; if (parent) { parent->slots[node->offset] = NULL; parent->count--; } else { /* * Shouldn't the tags already have all been cleared * by the caller? */ if (!is_idr(root)) root_tag_clear_all(root); root->xa_head = NULL; } WARN_ON_ONCE(!list_empty(&node->private_list)); radix_tree_node_free(node); deleted = true; node = parent; } while (node); return deleted; } /** * __radix_tree_create - create a slot in a radix tree * @root: radix tree root * @index: index key * @nodep: returns node * @slotp: returns slot * * Create, if necessary, and return the node and slot for an item * at position @index in the radix tree @root. * * Until there is more than one item in the tree, no nodes are * allocated and @root->xa_head is used as a direct slot instead of * pointing to a node, in which case *@nodep will be NULL. * * Returns -ENOMEM, or 0 for success. */ static int __radix_tree_create(struct radix_tree_root *root, unsigned long index, struct radix_tree_node **nodep, void __rcu ***slotp) { struct radix_tree_node *node = NULL, *child; void __rcu **slot = (void __rcu **)&root->xa_head; unsigned long maxindex; unsigned int shift, offset = 0; unsigned long max = index; gfp_t gfp = root_gfp_mask(root); shift = radix_tree_load_root(root, &child, &maxindex); /* Make sure the tree is high enough. */ if (max > maxindex) { int error = radix_tree_extend(root, gfp, max, shift); if (error < 0) return error; shift = error; child = rcu_dereference_raw(root->xa_head); } while (shift > 0) { shift -= RADIX_TREE_MAP_SHIFT; if (child == NULL) { /* Have to add a child node. */ child = radix_tree_node_alloc(gfp, node, root, shift, offset, 0, 0); if (!child) return -ENOMEM; rcu_assign_pointer(*slot, node_to_entry(child)); if (node) node->count++; } else if (!radix_tree_is_internal_node(child)) break; /* Go a level down */ node = entry_to_node(child); offset = radix_tree_descend(node, &child, index); slot = &node->slots[offset]; } if (nodep) *nodep = node; if (slotp) *slotp = slot; return 0; } /* * Free any nodes below this node. The tree is presumed to not need * shrinking, and any user data in the tree is presumed to not need a * destructor called on it. If we need to add a destructor, we can * add that functionality later. Note that we may not clear tags or * slots from the tree as an RCU walker may still have a pointer into * this subtree. We could replace the entries with RADIX_TREE_RETRY, * but we'll still have to clear those in rcu_free. */ static void radix_tree_free_nodes(struct radix_tree_node *node) { unsigned offset = 0; struct radix_tree_node *child = entry_to_node(node); for (;;) { void *entry = rcu_dereference_raw(child->slots[offset]); if (xa_is_node(entry) && child->shift) { child = entry_to_node(entry); offset = 0; continue; } offset++; while (offset == RADIX_TREE_MAP_SIZE) { struct radix_tree_node *old = child; offset = child->offset + 1; child = child->parent; WARN_ON_ONCE(!list_empty(&old->private_list)); radix_tree_node_free(old); if (old == entry_to_node(node)) return; } } } static inline int insert_entries(struct radix_tree_node *node, void __rcu **slot, void *item) { if (*slot) return -EEXIST; rcu_assign_pointer(*slot, item); if (node) { node->count++; if (xa_is_value(item)) node->nr_values++; } return 1; } /** * radix_tree_insert - insert into a radix tree * @root: radix tree root * @index: index key * @item: item to insert * * Insert an item into the radix tree at position @index. */ int radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item) { struct radix_tree_node *node; void __rcu **slot; int error; BUG_ON(radix_tree_is_internal_node(item)); error = __radix_tree_create(root, index, &node, &slot); if (error) return error; error = insert_entries(node, slot, item); if (error < 0) return error; if (node) { unsigned offset = get_slot_offset(node, slot); BUG_ON(tag_get(node, 0, offset)); BUG_ON(tag_get(node, 1, offset)); BUG_ON(tag_get(node, 2, offset)); } else { BUG_ON(root_tags_get(root)); } return 0; } EXPORT_SYMBOL(radix_tree_insert); /** * __radix_tree_lookup - lookup an item in a radix tree * @root: radix tree root * @index: index key * @nodep: returns node * @slotp: returns slot * * Lookup and return the item at position @index in the radix * tree @root. * * Until there is more than one item in the tree, no nodes are * allocated and @root->xa_head is used as a direct slot instead of * pointing to a node, in which case *@nodep will be NULL. */ void *__radix_tree_lookup(const struct radix_tree_root *root, unsigned long index, struct radix_tree_node **nodep, void __rcu ***slotp) { struct radix_tree_node *node, *parent; unsigned long maxindex; void __rcu **slot; restart: parent = NULL; slot = (void __rcu **)&root->xa_head; radix_tree_load_root(root, &node, &maxindex); if (index > maxindex) return NULL; while (radix_tree_is_internal_node(node)) { unsigned offset; parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, index); slot = parent->slots + offset; if (node == RADIX_TREE_RETRY) goto restart; if (parent->shift == 0) break; } if (nodep) *nodep = parent; if (slotp) *slotp = slot; return node; } /** * radix_tree_lookup_slot - lookup a slot in a radix tree * @root: radix tree root * @index: index key * * Returns: the slot corresponding to the position @index in the * radix tree @root. This is useful for update-if-exists operations. * * This function can be called under rcu_read_lock iff the slot is not * modified by radix_tree_replace_slot, otherwise it must be called * exclusive from other writers. Any dereference of the slot must be done * using radix_tree_deref_slot. */ void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *root, unsigned long index) { void __rcu **slot; if (!__radix_tree_lookup(root, index, NULL, &slot)) return NULL; return slot; } EXPORT_SYMBOL(radix_tree_lookup_slot); /** * radix_tree_lookup - perform lookup operation on a radix tree * @root: radix tree root * @index: index key * * Lookup the item at the position @index in the radix tree @root. * * This function can be called under rcu_read_lock, however the caller * must manage lifetimes of leaf nodes (eg. RCU may also be used to free * them safely). No RCU barriers are required to access or modify the * returned item, however. */ void *radix_tree_lookup(const struct radix_tree_root *root, unsigned long index) { return __radix_tree_lookup(root, index, NULL, NULL); } EXPORT_SYMBOL(radix_tree_lookup); static void replace_slot(void __rcu **slot, void *item, struct radix_tree_node *node, int count, int values) { if (node && (count || values)) { node->count += count; node->nr_values += values; } rcu_assign_pointer(*slot, item); } static bool node_tag_get(const struct radix_tree_root *root, const struct radix_tree_node *node, unsigned int tag, unsigned int offset) { if (node) return tag_get(node, tag, offset); return root_tag_get(root, tag); } /* * IDR users want to be able to store NULL in the tree, so if the slot isn't * free, don't adjust the count, even if it's transitioning between NULL and * non-NULL. For the IDA, we mark slots as being IDR_FREE while they still * have empty bits, but it only stores NULL in slots when they're being * deleted. */ static int calculate_count(struct radix_tree_root *root, struct radix_tree_node *node, void __rcu **slot, void *item, void *old) { if (is_idr(root)) { unsigned offset = get_slot_offset(node, slot); bool free = node_tag_get(root, node, IDR_FREE, offset); if (!free) return 0; if (!old) return 1; } return !!item - !!old; } /** * __radix_tree_replace - replace item in a slot * @root: radix tree root * @node: pointer to tree node * @slot: pointer to slot in @node * @item: new item to store in the slot. * * For use with __radix_tree_lookup(). Caller must hold tree write locked * across slot lookup and replacement. */ void __radix_tree_replace(struct radix_tree_root *root, struct radix_tree_node *node, void __rcu **slot, void *item) { void *old = rcu_dereference_raw(*slot); int values = !!xa_is_value(item) - !!xa_is_value(old); int count = calculate_count(root, node, slot, item, old); /* * This function supports replacing value entries and * deleting entries, but that needs accounting against the * node unless the slot is root->xa_head. */ WARN_ON_ONCE(!node && (slot != (void __rcu **)&root->xa_head) && (count || values)); replace_slot(slot, item, node, count, values); if (!node) return; delete_node(root, node); } /** * radix_tree_replace_slot - replace item in a slot * @root: radix tree root * @slot: pointer to slot * @item: new item to store in the slot. * * For use with radix_tree_lookup_slot() and * radix_tree_gang_lookup_tag_slot(). Caller must hold tree write locked * across slot lookup and replacement. * * NOTE: This cannot be used to switch between non-entries (empty slots), * regular entries, and value entries, as that requires accounting * inside the radix tree node. When switching from one type of entry or * deleting, use __radix_tree_lookup() and __radix_tree_replace() or * radix_tree_iter_replace(). */ void radix_tree_replace_slot(struct radix_tree_root *root, void __rcu **slot, void *item) { __radix_tree_replace(root, NULL, slot, item); } EXPORT_SYMBOL(radix_tree_replace_slot); /** * radix_tree_iter_replace - replace item in a slot * @root: radix tree root * @iter: iterator state * @slot: pointer to slot * @item: new item to store in the slot. * * For use with radix_tree_for_each_slot(). * Caller must hold tree write locked. */ void radix_tree_iter_replace(struct radix_tree_root *root, const struct radix_tree_iter *iter, void __rcu **slot, void *item) { __radix_tree_replace(root, iter->node, slot, item); } static void node_tag_set(struct radix_tree_root *root, struct radix_tree_node *node, unsigned int tag, unsigned int offset) { while (node) { if (tag_get(node, tag, offset)) return; tag_set(node, tag, offset); offset = node->offset; node = node->parent; } if (!root_tag_get(root, tag)) root_tag_set(root, tag); } /** * radix_tree_tag_set - set a tag on a radix tree node * @root: radix tree root * @index: index key * @tag: tag index * * Set the search tag (which must be < RADIX_TREE_MAX_TAGS) * corresponding to @index in the radix tree. From * the root all the way down to the leaf node. * * Returns the address of the tagged item. Setting a tag on a not-present * item is a bug. */ void *radix_tree_tag_set(struct radix_tree_root *root, unsigned long index, unsigned int tag) { struct radix_tree_node *node, *parent; unsigned long maxindex; radix_tree_load_root(root, &node, &maxindex); BUG_ON(index > maxindex); while (radix_tree_is_internal_node(node)) { unsigned offset; parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, index); BUG_ON(!node); if (!tag_get(parent, tag, offset)) tag_set(parent, tag, offset); } /* set the root's tag bit */ if (!root_tag_get(root, tag)) root_tag_set(root, tag); return node; } EXPORT_SYMBOL(radix_tree_tag_set); static void node_tag_clear(struct radix_tree_root *root, struct radix_tree_node *node, unsigned int tag, unsigned int offset) { while (node) { if (!tag_get(node, tag, offset)) return; tag_clear(node, tag, offset); if (any_tag_set(node, tag)) return; offset = node->offset; node = node->parent; } /* clear the root's tag bit */ if (root_tag_get(root, tag)) root_tag_clear(root, tag); } /** * radix_tree_tag_clear - clear a tag on a radix tree node * @root: radix tree root * @index: index key * @tag: tag index * * Clear the search tag (which must be < RADIX_TREE_MAX_TAGS) * corresponding to @index in the radix tree. If this causes * the leaf node to have no tags set then clear the tag in the * next-to-leaf node, etc. * * Returns the address of the tagged item on success, else NULL. ie: * has the same return value and semantics as radix_tree_lookup(). */ void *radix_tree_tag_clear(struct radix_tree_root *root, unsigned long index, unsigned int tag) { struct radix_tree_node *node, *parent; unsigned long maxindex; int offset = 0; radix_tree_load_root(root, &node, &maxindex); if (index > maxindex) return NULL; parent = NULL; while (radix_tree_is_internal_node(node)) { parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, index); } if (node) node_tag_clear(root, parent, tag, offset); return node; } EXPORT_SYMBOL(radix_tree_tag_clear); /** * radix_tree_iter_tag_clear - clear a tag on the current iterator entry * @root: radix tree root * @iter: iterator state * @tag: tag to clear */ void radix_tree_iter_tag_clear(struct radix_tree_root *root, const struct radix_tree_iter *iter, unsigned int tag) { node_tag_clear(root, iter->node, tag, iter_offset(iter)); } /** * radix_tree_tag_get - get a tag on a radix tree node * @root: radix tree root * @index: index key * @tag: tag index (< RADIX_TREE_MAX_TAGS) * * Return values: * * 0: tag not present or not set * 1: tag set * * Note that the return value of this function may not be relied on, even if * the RCU lock is held, unless tag modification and node deletion are excluded * from concurrency. */ int radix_tree_tag_get(const struct radix_tree_root *root, unsigned long index, unsigned int tag) { struct radix_tree_node *node, *parent; unsigned long maxindex; if (!root_tag_get(root, tag)) return 0; radix_tree_load_root(root, &node, &maxindex); if (index > maxindex) return 0; while (radix_tree_is_internal_node(node)) { unsigned offset; parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, index); if (!tag_get(parent, tag, offset)) return 0; if (node == RADIX_TREE_RETRY) break; } return 1; } EXPORT_SYMBOL(radix_tree_tag_get); /* Construct iter->tags bit-mask from node->tags[tag] array */ static void set_iter_tags(struct radix_tree_iter *iter, struct radix_tree_node *node, unsigned offset, unsigned tag) { unsigned tag_long = offset / BITS_PER_LONG; unsigned tag_bit = offset % BITS_PER_LONG; if (!node) { iter->tags = 1; return; } iter->tags = node->tags[tag][tag_long] >> tag_bit; /* This never happens if RADIX_TREE_TAG_LONGS == 1 */ if (tag_long < RADIX_TREE_TAG_LONGS - 1) { /* Pick tags from next element */ if (tag_bit) iter->tags |= node->tags[tag][tag_long + 1] << (BITS_PER_LONG - tag_bit); /* Clip chunk size, here only BITS_PER_LONG tags */ iter->next_index = __radix_tree_iter_add(iter, BITS_PER_LONG); } } void __rcu **radix_tree_iter_resume(void __rcu **slot, struct radix_tree_iter *iter) { iter->index = __radix_tree_iter_add(iter, 1); iter->next_index = iter->index; iter->tags = 0; return NULL; } EXPORT_SYMBOL(radix_tree_iter_resume); /** * radix_tree_next_chunk - find next chunk of slots for iteration * * @root: radix tree root * @iter: iterator state * @flags: RADIX_TREE_ITER_* flags and tag index * Returns: pointer to chunk first slot, or NULL if iteration is over */ void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root, struct radix_tree_iter *iter, unsigned flags) { unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK; struct radix_tree_node *node, *child; unsigned long index, offset, maxindex; if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag)) return NULL; /* * Catch next_index overflow after ~0UL. iter->index never overflows * during iterating; it can be zero only at the beginning. * And we cannot overflow iter->next_index in a single step, * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG. * * This condition also used by radix_tree_next_slot() to stop * contiguous iterating, and forbid switching to the next chunk. */ index = iter->next_index; if (!index && iter->index) return NULL; restart: radix_tree_load_root(root, &child, &maxindex); if (index > maxindex) return NULL; if (!child) return NULL; if (!radix_tree_is_internal_node(child)) { /* Single-slot tree */ iter->index = index; iter->next_index = maxindex + 1; iter->tags = 1; iter->node = NULL; return (void __rcu **)&root->xa_head; } do { node = entry_to_node(child); offset = radix_tree_descend(node, &child, index); if ((flags & RADIX_TREE_ITER_TAGGED) ? !tag_get(node, tag, offset) : !child) { /* Hole detected */ if (flags & RADIX_TREE_ITER_CONTIG) return NULL; if (flags & RADIX_TREE_ITER_TAGGED) offset = radix_tree_find_next_bit(node, tag, offset + 1); else while (++offset < RADIX_TREE_MAP_SIZE) { void *slot = rcu_dereference_raw( node->slots[offset]); if (slot) break; } index &= ~node_maxindex(node); index += offset << node->shift; /* Overflow after ~0UL */ if (!index) return NULL; if (offset == RADIX_TREE_MAP_SIZE) goto restart; child = rcu_dereference_raw(node->slots[offset]); } if (!child) goto restart; if (child == RADIX_TREE_RETRY) break; } while (node->shift && radix_tree_is_internal_node(child)); /* Update the iterator state */ iter->index = (index &~ node_maxindex(node)) | offset; iter->next_index = (index | node_maxindex(node)) + 1; iter->node = node; if (flags & RADIX_TREE_ITER_TAGGED) set_iter_tags(iter, node, offset, tag); return node->slots + offset; } EXPORT_SYMBOL(radix_tree_next_chunk); /** * radix_tree_gang_lookup - perform multiple lookup on a radix tree * @root: radix tree root * @results: where the results of the lookup are placed * @first_index: start the lookup from this key * @max_items: place up to this many items at *results * * Performs an index-ascending scan of the tree for present items. Places * them at *@results and returns the number of items which were placed at * *@results. * * The implementation is naive. * * Like radix_tree_lookup, radix_tree_gang_lookup may be called under * rcu_read_lock. In this case, rather than the returned results being * an atomic snapshot of the tree at a single point in time, the * semantics of an RCU protected gang lookup are as though multiple * radix_tree_lookups have been issued in individual locks, and results * stored in 'results'. */ unsigned int radix_tree_gang_lookup(const struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items) { struct radix_tree_iter iter; void __rcu **slot; unsigned int ret = 0; if (unlikely(!max_items)) return 0; radix_tree_for_each_slot(slot, root, &iter, first_index) { results[ret] = rcu_dereference_raw(*slot); if (!results[ret]) continue; if (radix_tree_is_internal_node(results[ret])) { slot = radix_tree_iter_retry(&iter); continue; } if (++ret == max_items) break; } return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup); /** * radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree * based on a tag * @root: radix tree root * @results: where the results of the lookup are placed * @first_index: start the lookup from this key * @max_items: place up to this many items at *results * @tag: the tag index (< RADIX_TREE_MAX_TAGS) * * Performs an index-ascending scan of the tree for present items which * have the tag indexed by @tag set. Places the items at *@results and * returns the number of items which were placed at *@results. */ unsigned int radix_tree_gang_lookup_tag(const struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items, unsigned int tag) { struct radix_tree_iter iter; void __rcu **slot; unsigned int ret = 0; if (unlikely(!max_items)) return 0; radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) { results[ret] = rcu_dereference_raw(*slot); if (!results[ret]) continue; if (radix_tree_is_internal_node(results[ret])) { slot = radix_tree_iter_retry(&iter); continue; } if (++ret == max_items) break; } return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup_tag); /** * radix_tree_gang_lookup_tag_slot - perform multiple slot lookup on a * radix tree based on a tag * @root: radix tree root * @results: where the results of the lookup are placed * @first_index: start the lookup from this key * @max_items: place up to this many items at *results * @tag: the tag index (< RADIX_TREE_MAX_TAGS) * * Performs an index-ascending scan of the tree for present items which * have the tag indexed by @tag set. Places the slots at *@results and * returns the number of slots which were placed at *@results. */ unsigned int radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *root, void __rcu ***results, unsigned long first_index, unsigned int max_items, unsigned int tag) { struct radix_tree_iter iter; void __rcu **slot; unsigned int ret = 0; if (unlikely(!max_items)) return 0; radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) { results[ret] = slot; if (++ret == max_items) break; } return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); static bool __radix_tree_delete(struct radix_tree_root *root, struct radix_tree_node *node, void __rcu **slot) { void *old = rcu_dereference_raw(*slot); int values = xa_is_value(old) ? -1 : 0; unsigned offset = get_slot_offset(node, slot); int tag; if (is_idr(root)) node_tag_set(root, node, IDR_FREE, offset); else for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) node_tag_clear(root, node, tag, offset); replace_slot(slot, NULL, node, -1, values); return node && delete_node(root, node); } /** * radix_tree_iter_delete - delete the entry at this iterator position * @root: radix tree root * @iter: iterator state * @slot: pointer to slot * * Delete the entry at the position currently pointed to by the iterator. * This may result in the current node being freed; if it is, the iterator * is advanced so that it will not reference the freed memory. This * function may be called without any locking if there are no other threads * which can access this tree. */ void radix_tree_iter_delete(struct radix_tree_root *root, struct radix_tree_iter *iter, void __rcu **slot) { if (__radix_tree_delete(root, iter->node, slot)) iter->index = iter->next_index; } EXPORT_SYMBOL(radix_tree_iter_delete); /** * radix_tree_delete_item - delete an item from a radix tree * @root: radix tree root * @index: index key * @item: expected item * * Remove @item at @index from the radix tree rooted at @root. * * Return: the deleted entry, or %NULL if it was not present * or the entry at the given @index was not @item. */ void *radix_tree_delete_item(struct radix_tree_root *root, unsigned long index, void *item) { struct radix_tree_node *node = NULL; void __rcu **slot = NULL; void *entry; entry = __radix_tree_lookup(root, index, &node, &slot); if (!slot) return NULL; if (!entry && (!is_idr(root) || node_tag_get(root, node, IDR_FREE, get_slot_offset(node, slot)))) return NULL; if (item && entry != item) return NULL; __radix_tree_delete(root, node, slot); return entry; } EXPORT_SYMBOL(radix_tree_delete_item); /** * radix_tree_delete - delete an entry from a radix tree * @root: radix tree root * @index: index key * * Remove the entry at @index from the radix tree rooted at @root. * * Return: The deleted entry, or %NULL if it was not present. */ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) { return radix_tree_delete_item(root, index, NULL); } EXPORT_SYMBOL(radix_tree_delete); /** * radix_tree_tagged - test whether any items in the tree are tagged * @root: radix tree root * @tag: tag to test */ int radix_tree_tagged(const struct radix_tree_root *root, unsigned int tag) { return root_tag_get(root, tag); } EXPORT_SYMBOL(radix_tree_tagged); /** * idr_preload - preload for idr_alloc() * @gfp_mask: allocation mask to use for preloading * * Preallocate memory to use for the next call to idr_alloc(). This function * returns with preemption disabled. It will be enabled by idr_preload_end(). */ void idr_preload(gfp_t gfp_mask) { if (__radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE)) local_lock(&radix_tree_preloads.lock); } EXPORT_SYMBOL(idr_preload); void __rcu **idr_get_free(struct radix_tree_root *root, struct radix_tree_iter *iter, gfp_t gfp, unsigned long max) { struct radix_tree_node *node = NULL, *child; void __rcu **slot = (void __rcu **)&root->xa_head; unsigned long maxindex, start = iter->next_index; unsigned int shift, offset = 0; grow: shift = radix_tree_load_root(root, &child, &maxindex); if (!radix_tree_tagged(root, IDR_FREE)) start = max(start, maxindex + 1); if (start > max) return ERR_PTR(-ENOSPC); if (start > maxindex) { int error = radix_tree_extend(root, gfp, start, shift); if (error < 0) return ERR_PTR(error); shift = error; child = rcu_dereference_raw(root->xa_head); } if (start == 0 && shift == 0) shift = RADIX_TREE_MAP_SHIFT; while (shift) { shift -= RADIX_TREE_MAP_SHIFT; if (child == NULL) { /* Have to add a child node. */ child = radix_tree_node_alloc(gfp, node, root, shift, offset, 0, 0); if (!child) return ERR_PTR(-ENOMEM); all_tag_set(child, IDR_FREE); rcu_assign_pointer(*slot, node_to_entry(child)); if (node) node->count++; } else if (!radix_tree_is_internal_node(child)) break; node = entry_to_node(child); offset = radix_tree_descend(node, &child, start); if (!tag_get(node, IDR_FREE, offset)) { offset = radix_tree_find_next_bit(node, IDR_FREE, offset + 1); start = next_index(start, node, offset); if (start > max || start == 0) return ERR_PTR(-ENOSPC); while (offset == RADIX_TREE_MAP_SIZE) { offset = node->offset + 1; node = node->parent; if (!node) goto grow; shift = node->shift; } child = rcu_dereference_raw(node->slots[offset]); } slot = &node->slots[offset]; } iter->index = start; if (node) iter->next_index = 1 + min(max, (start | node_maxindex(node))); else iter->next_index = 1; iter->node = node; set_iter_tags(iter, node, offset, IDR_FREE); return slot; } /** * idr_destroy - release all internal memory from an IDR * @idr: idr handle * * After this function is called, the IDR is empty, and may be reused or * the data structure containing it may be freed. * * A typical clean-up sequence for objects stored in an idr tree will use * idr_for_each() to free all objects, if necessary, then idr_destroy() to * free the memory used to keep track of those objects. */ void idr_destroy(struct idr *idr) { struct radix_tree_node *node = rcu_dereference_raw(idr->idr_rt.xa_head); if (radix_tree_is_internal_node(node)) radix_tree_free_nodes(node); idr->idr_rt.xa_head = NULL; root_tag_set(&idr->idr_rt, IDR_FREE); } EXPORT_SYMBOL(idr_destroy); static void radix_tree_node_ctor(void *arg) { struct radix_tree_node *node = arg; memset(node, 0, sizeof(*node)); INIT_LIST_HEAD(&node->private_list); } static int radix_tree_cpu_dead(unsigned int cpu) { struct radix_tree_preload *rtp; struct radix_tree_node *node; /* Free per-cpu pool of preloaded nodes */ rtp = &per_cpu(radix_tree_preloads, cpu); while (rtp->nr) { node = rtp->nodes; rtp->nodes = node->parent; kmem_cache_free(radix_tree_node_cachep, node); rtp->nr--; } return 0; } void __init radix_tree_init(void) { int ret; BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32); BUILD_BUG_ON(ROOT_IS_IDR & ~GFP_ZONEMASK); BUILD_BUG_ON(XA_CHUNK_SIZE > 255); radix_tree_node_cachep = kmem_cache_create("radix_tree_node", sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, radix_tree_node_ctor); ret = cpuhp_setup_state_nocalls(CPUHP_RADIX_DEAD, "lib/radix:dead", NULL, radix_tree_cpu_dead); WARN_ON(ret < 0); }
1838 119 1739 1732 1723 1691 311 86 71 14 18 136 271 3 36 236 26 10 16 670 399 281 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 // SPDX-License-Identifier: GPL-2.0-only /* * Access kernel or user memory without faulting. */ #include <linux/export.h> #include <linux/mm.h> #include <linux/uaccess.h> #include <asm/tlb.h> bool __weak copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) { return true; } /* * The below only uses kmsan_check_memory() to ensure uninitialized kernel * memory isn't leaked. */ #define copy_from_kernel_nofault_loop(dst, src, len, type, err_label) \ while (len >= sizeof(type)) { \ __get_kernel_nofault(dst, src, type, err_label); \ kmsan_check_memory(src, sizeof(type)); \ dst += sizeof(type); \ src += sizeof(type); \ len -= sizeof(type); \ } long copy_from_kernel_nofault(void *dst, const void *src, size_t size) { unsigned long align = 0; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) align = (unsigned long)dst | (unsigned long)src; if (!copy_from_kernel_nofault_allowed(src, size)) return -ERANGE; pagefault_disable(); if (!(align & 7)) copy_from_kernel_nofault_loop(dst, src, size, u64, Efault); if (!(align & 3)) copy_from_kernel_nofault_loop(dst, src, size, u32, Efault); if (!(align & 1)) copy_from_kernel_nofault_loop(dst, src, size, u16, Efault); copy_from_kernel_nofault_loop(dst, src, size, u8, Efault); pagefault_enable(); return 0; Efault: pagefault_enable(); return -EFAULT; } EXPORT_SYMBOL_GPL(copy_from_kernel_nofault); #define copy_to_kernel_nofault_loop(dst, src, len, type, err_label) \ while (len >= sizeof(type)) { \ __put_kernel_nofault(dst, src, type, err_label); \ instrument_write(dst, sizeof(type)); \ dst += sizeof(type); \ src += sizeof(type); \ len -= sizeof(type); \ } long copy_to_kernel_nofault(void *dst, const void *src, size_t size) { unsigned long align = 0; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) align = (unsigned long)dst | (unsigned long)src; pagefault_disable(); if (!(align & 7)) copy_to_kernel_nofault_loop(dst, src, size, u64, Efault); if (!(align & 3)) copy_to_kernel_nofault_loop(dst, src, size, u32, Efault); if (!(align & 1)) copy_to_kernel_nofault_loop(dst, src, size, u16, Efault); copy_to_kernel_nofault_loop(dst, src, size, u8, Efault); pagefault_enable(); return 0; Efault: pagefault_enable(); return -EFAULT; } long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count) { const void *src = unsafe_addr; if (unlikely(count <= 0)) return 0; if (!copy_from_kernel_nofault_allowed(unsafe_addr, count)) return -ERANGE; pagefault_disable(); do { __get_kernel_nofault(dst, src, u8, Efault); dst++; src++; } while (dst[-1] && src - unsafe_addr < count); pagefault_enable(); dst[-1] = '\0'; return src - unsafe_addr; Efault: pagefault_enable(); dst[0] = '\0'; return -EFAULT; } /** * copy_from_user_nofault(): safely attempt to read from a user-space location * @dst: pointer to the buffer that shall take the data * @src: address to read from. This must be a user address. * @size: size of the data chunk * * Safely read from user address @src to the buffer at @dst. If a kernel fault * happens, handle that and return -EFAULT. */ long copy_from_user_nofault(void *dst, const void __user *src, size_t size) { long ret = -EFAULT; if (!__access_ok(src, size)) return ret; if (!nmi_uaccess_okay()) return ret; pagefault_disable(); ret = __copy_from_user_inatomic(dst, src, size); pagefault_enable(); if (ret) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(copy_from_user_nofault); /** * copy_to_user_nofault(): safely attempt to write to a user-space location * @dst: address to write to * @src: pointer to the data that shall be written * @size: size of the data chunk * * Safely write to address @dst from the buffer at @src. If a kernel fault * happens, handle that and return -EFAULT. */ long copy_to_user_nofault(void __user *dst, const void *src, size_t size) { long ret = -EFAULT; if (access_ok(dst, size)) { pagefault_disable(); ret = __copy_to_user_inatomic(dst, src, size); pagefault_enable(); } if (ret) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(copy_to_user_nofault); /** * strncpy_from_user_nofault: - Copy a NUL terminated string from unsafe user * address. * @dst: Destination address, in kernel space. This buffer must be at * least @count bytes long. * @unsafe_addr: Unsafe user address. * @count: Maximum number of bytes to copy, including the trailing NUL. * * Copies a NUL-terminated string from unsafe user address to kernel buffer. * * On success, returns the length of the string INCLUDING the trailing NUL. * * If access fails, returns -EFAULT (some data may have been copied * and the trailing NUL added). * * If @count is smaller than the length of the string, copies @count-1 bytes, * sets the last byte of @dst buffer to NUL and returns @count. */ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, long count) { long ret; if (unlikely(count <= 0)) return 0; pagefault_disable(); ret = strncpy_from_user(dst, unsafe_addr, count); pagefault_enable(); if (ret >= count) { ret = count; dst[ret - 1] = '\0'; } else if (ret >= 0) { ret++; } return ret; } /** * strnlen_user_nofault: - Get the size of a user string INCLUDING final NUL. * @unsafe_addr: The string to measure. * @count: Maximum count (including NUL) * * Get the size of a NUL-terminated string in user space without pagefault. * * Returns the size of the string INCLUDING the terminating NUL. * * If the string is too long, returns a number larger than @count. User * has to check the return value against "> count". * On exception (or invalid count), returns 0. * * Unlike strnlen_user, this can be used from IRQ handler etc. because * it disables pagefaults. */ long strnlen_user_nofault(const void __user *unsafe_addr, long count) { int ret; pagefault_disable(); ret = strnlen_user(unsafe_addr, count); pagefault_enable(); return ret; } void __copy_overflow(int size, unsigned long count) { WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); } EXPORT_SYMBOL(__copy_overflow);
38 38 35 2 17 16 30 30 2 16 15 34 34 58 89 89 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux NET3: IP/IP protocol decoder. * * Authors: * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 * * Fixes: * Alan Cox : Merged and made usable non modular (its so tiny its silly as * a module taking up 2 pages). * Alan Cox : Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph) * to keep ip_forward happy. * Alan Cox : More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8). * Kai Schulte : Fixed #defines for IP_FIREWALL->FIREWALL * David Woodhouse : Perform some basic ICMP handling. * IPIP Routing without decapsulation. * Carlos Picoto : GRE over IP support * Alexey Kuznetsov: Reworked. Really, now it is truncated version of ipv4/ip_gre.c. * I do not want to merge them together. */ /* tunnel.c: an IP tunnel driver The purpose of this driver is to provide an IP tunnel through which you can tunnel network traffic transparently across subnets. This was written by looking at Nick Holloway's dummy driver Thanks for the great code! -Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 Minor tweaks: Cleaned up the code a little and added some pre-1.3.0 tweaks. dev->hard_header/hard_header_len changed to use no headers. Comments/bracketing tweaked. Made the tunnels use dev->name not tunnel: when error reporting. Added tx_dropped stat -Alan Cox (alan@lxorguk.ukuu.org.uk) 21 March 95 Reworked: Changed to tunnel to destination gateway in addition to the tunnel's pointopoint address Almost completely rewritten Note: There is currently no firewall or ICMP handling done. -Sam Lantinga (slouken@cs.ucdavis.edu) 02/13/96 */ /* Things I wish I had known when writing the tunnel driver: When the tunnel_xmit() function is called, the skb contains the packet to be sent (plus a great deal of extra info), and dev contains the tunnel device that _we_ are. When we are passed a packet, we are expected to fill in the source address with our source IP address. What is the proper way to allocate, copy and free a buffer? After you allocate it, it is a "0 length" chunk of memory starting at zero. If you want to add headers to the buffer later, you'll have to call "skb_reserve(skb, amount)" with the amount of memory you want reserved. Then, you call "skb_put(skb, amount)" with the amount of space you want in the buffer. skb_put() returns a pointer to the top (#0) of that buffer. skb->len is set to the amount of space you have "allocated" with skb_put(). You can then write up to skb->len bytes to that buffer. If you need more, you can call skb_put() again with the additional amount of space you need. You can find out how much more space you can allocate by calling "skb_tailroom(skb)". Now, to add header space, call "skb_push(skb, header_len)". This creates space at the beginning of the buffer and returns a pointer to this new space. If later you need to strip a header from a buffer, call "skb_pull(skb, header_len)". skb_headroom() will return how much space is left at the top of the buffer (before the main data). Remember, this headroom space must be reserved before the skb_put() function is called. */ /* This version of net/ipv4/ipip.c is cloned of net/ipv4/ip_gre.c For comments look at net/ipv4/ip_gre.c --ANK */ #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/netfilter_ipv4.h> #include <linux/if_ether.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/ip_tunnels.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/dst_metadata.h> static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static unsigned int ipip_net_id __read_mostly; static int ipip_tunnel_init(struct net_device *dev); static struct rtnl_link_ops ipip_link_ops __read_mostly; static int ipip_err(struct sk_buff *skb, u32 info) { /* All the routers (except for Linux) return only * 8 bytes of packet payload. It means, that precise relaying of * ICMP in the real Internet is absolutely infeasible. */ struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); const struct iphdr *iph = (const struct iphdr *)skb->data; IP_TUNNEL_DECLARE_FLAGS(flags) = { }; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; int err = 0; __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); t = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->daddr, iph->saddr, 0); if (!t) { err = -ENOENT; goto out; } switch (type) { case ICMP_DEST_UNREACH: switch (code) { case ICMP_SR_FAILED: /* Impossible event. */ goto out; default: /* All others are translated to HOST_UNREACH. * rfc2003 contains "deep thoughts" about NET_UNREACH, * I believe they are just ether pollution. --ANK */ break; } break; case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) goto out; break; case ICMP_REDIRECT: break; default: goto out; } if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, net, info, t->parms.link, iph->protocol); goto out; } if (type == ICMP_REDIRECT) { ipv4_redirect(skb, net, t->parms.link, iph->protocol); goto out; } if (t->parms.iph.daddr == 0) { err = -ENOENT; goto out; } if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) goto out; if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; t->err_time = jiffies; out: return err; } static const struct tnl_ptk_info ipip_tpi = { /* no tunnel info required for ipip. */ .proto = htons(ETH_P_IP), }; #if IS_ENABLED(CONFIG_MPLS) static const struct tnl_ptk_info mplsip_tpi = { /* no tunnel info required for mplsip. */ .proto = htons(ETH_P_MPLS_UC), }; #endif static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto) { struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct metadata_dst *tun_dst = NULL; struct ip_tunnel *tunnel; const struct iphdr *iph; __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); iph = ip_hdr(skb); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr, iph->daddr, 0); if (tunnel) { const struct tnl_ptk_info *tpi; if (tunnel->parms.iph.protocol != ipproto && tunnel->parms.iph.protocol != 0) goto drop; if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; #if IS_ENABLED(CONFIG_MPLS) if (ipproto == IPPROTO_MPLS) tpi = &mplsip_tpi; else #endif tpi = &ipip_tpi; if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; if (tunnel->collect_md) { ip_tunnel_flags_zero(flags); tun_dst = ip_tun_rx_dst(skb, flags, 0, 0); if (!tun_dst) return 0; ip_tunnel_md_udp_encap(skb, &tun_dst->u.tun_info); } skb_reset_mac_header(skb); return ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); } return -1; drop: kfree_skb(skb); return 0; } static int ipip_rcv(struct sk_buff *skb) { return ipip_tunnel_rcv(skb, IPPROTO_IPIP); } #if IS_ENABLED(CONFIG_MPLS) static int mplsip_rcv(struct sk_buff *skb) { return ipip_tunnel_rcv(skb, IPPROTO_MPLS); } #endif /* * This function assumes it is being called from dev_queue_xmit() * and that skb is filled properly by that function. */ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tiph = &tunnel->parms.iph; u8 ipproto; if (!pskb_inet_may_pull(skb)) goto tx_error; switch (skb->protocol) { case htons(ETH_P_IP): ipproto = IPPROTO_IPIP; break; #if IS_ENABLED(CONFIG_MPLS) case htons(ETH_P_MPLS_UC): ipproto = IPPROTO_MPLS; break; #endif default: goto tx_error; } if (tiph->protocol != ipproto && tiph->protocol != 0) goto tx_error; if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) goto tx_error; skb_set_inner_ipproto(skb, ipproto); if (tunnel->collect_md) ip_md_tunnel_xmit(skb, dev, ipproto, 0); else ip_tunnel_xmit(skb, dev, tiph, ipproto); return NETDEV_TX_OK; tx_error: kfree_skb(skb); DEV_STATS_INC(dev, tx_errors); return NETDEV_TX_OK; } static bool ipip_tunnel_ioctl_verify_protocol(u8 ipproto) { switch (ipproto) { case 0: case IPPROTO_IPIP: #if IS_ENABLED(CONFIG_MPLS) case IPPROTO_MPLS: #endif return true; } return false; } static int ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) { if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { if (p->iph.version != 4 || !ipip_tunnel_ioctl_verify_protocol(p->iph.protocol) || p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF))) return -EINVAL; } p->i_key = p->o_key = 0; ip_tunnel_flags_zero(p->i_flags); ip_tunnel_flags_zero(p->o_flags); return ip_tunnel_ctl(dev, p, cmd); } static const struct net_device_ops ipip_netdev_ops = { .ndo_init = ipip_tunnel_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = ipip_tunnel_xmit, .ndo_siocdevprivate = ip_tunnel_siocdevprivate, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipip_tunnel_ctl, }; #define IPIP_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) static void ipip_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipip_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->type = ARPHRD_TUNNEL; dev->flags = IFF_NOARP; dev->addr_len = 4; dev->lltx = true; netif_keep_dst(dev); dev->features |= IPIP_FEATURES; dev->hw_features |= IPIP_FEATURES; ip_tunnel_setup(dev, ipip_net_id); } static int ipip_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); __dev_addr_set(dev, &tunnel->parms.iph.saddr, 4); memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); tunnel->tun_hlen = 0; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; return ip_tunnel_init(dev); } static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { u8 proto; if (!data || !data[IFLA_IPTUN_PROTO]) return 0; proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (proto != IPPROTO_IPIP && proto != IPPROTO_MPLS && proto != 0) return -EINVAL; return 0; } static void ipip_netlink_parms(struct nlattr *data[], struct ip_tunnel_parm_kern *parms, bool *collect_md, __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); parms->iph.version = 4; parms->iph.protocol = IPPROTO_IPIP; parms->iph.ihl = 5; *collect_md = false; if (!data) return; ip_tunnel_netlink_parms(data, parms); if (data[IFLA_IPTUN_COLLECT_METADATA]) *collect_md = true; if (data[IFLA_IPTUN_FWMARK]) *fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } static int ipip_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct ip_tunnel_encap ipencap; struct ip_tunnel_parm_kern p; __u32 fwmark = 0; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { int err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } ipip_netlink_parms(data, &p, &t->collect_md, &fwmark); return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, &p, fwmark); } static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_encap ipencap; struct ip_tunnel_parm_kern p; bool collect_md; __u32 fwmark = t->fwmark; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { int err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } ipip_netlink_parms(data, &p, &collect_md, &fwmark); if (collect_md) return -EINVAL; if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) || (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) return -EINVAL; return ip_tunnel_changelink(dev, tb, &p, fwmark); } static size_t ipip_get_size(const struct net_device *dev) { return /* IFLA_IPTUN_LINK */ nla_total_size(4) + /* IFLA_IPTUN_LOCAL */ nla_total_size(4) + /* IFLA_IPTUN_REMOTE */ nla_total_size(4) + /* IFLA_IPTUN_TTL */ nla_total_size(1) + /* IFLA_IPTUN_TOS */ nla_total_size(1) + /* IFLA_IPTUN_PROTO */ nla_total_size(1) + /* IFLA_IPTUN_PMTUDISC */ nla_total_size(1) + /* IFLA_IPTUN_ENCAP_TYPE */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_FLAGS */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_SPORT */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_DPORT */ nla_total_size(2) + /* IFLA_IPTUN_COLLECT_METADATA */ nla_total_size(0) + /* IFLA_IPTUN_FWMARK */ nla_total_size(4) + 0; } static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_parm_kern *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || nla_put_in_addr(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) || nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) || nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) || nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) || nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, !!(parm->iph.frag_off & htons(IP_DF))) || nla_put_u32(skb, IFLA_IPTUN_FWMARK, tunnel->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags)) goto nla_put_failure; if (tunnel->collect_md) if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_LINK] = { .type = NLA_U32 }, [IFLA_IPTUN_LOCAL] = { .type = NLA_U32 }, [IFLA_IPTUN_REMOTE] = { .type = NLA_U32 }, [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, [IFLA_IPTUN_TOS] = { .type = NLA_U8 }, [IFLA_IPTUN_PROTO] = { .type = NLA_U8 }, [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 }, [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops ipip_link_ops __read_mostly = { .kind = "ipip", .maxtype = IFLA_IPTUN_MAX, .policy = ipip_policy, .priv_size = sizeof(struct ip_tunnel), .setup = ipip_tunnel_setup, .validate = ipip_tunnel_validate, .newlink = ipip_newlink, .changelink = ipip_changelink, .dellink = ip_tunnel_dellink, .get_size = ipip_get_size, .fill_info = ipip_fill_info, .get_link_net = ip_tunnel_get_link_net, }; static struct xfrm_tunnel ipip_handler __read_mostly = { .handler = ipip_rcv, .err_handler = ipip_err, .priority = 1, }; #if IS_ENABLED(CONFIG_MPLS) static struct xfrm_tunnel mplsip_handler __read_mostly = { .handler = mplsip_rcv, .err_handler = ipip_err, .priority = 1, }; #endif static int __net_init ipip_init_net(struct net *net) { return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0"); } static void __net_exit ipip_exit_rtnl(struct net *net, struct list_head *dev_to_kill) { ip_tunnel_delete_net(net, ipip_net_id, &ipip_link_ops, dev_to_kill); } static struct pernet_operations ipip_net_ops = { .init = ipip_init_net, .exit_rtnl = ipip_exit_rtnl, .id = &ipip_net_id, .size = sizeof(struct ip_tunnel_net), }; static int __init ipip_init(void) { int err; pr_info("ipip: IPv4 and MPLS over IPv4 tunneling driver\n"); err = register_pernet_device(&ipip_net_ops); if (err < 0) return err; err = xfrm4_tunnel_register(&ipip_handler, AF_INET); if (err < 0) { pr_info("%s: can't register tunnel\n", __func__); goto xfrm_tunnel_ipip_failed; } #if IS_ENABLED(CONFIG_MPLS) err = xfrm4_tunnel_register(&mplsip_handler, AF_MPLS); if (err < 0) { pr_info("%s: can't register tunnel\n", __func__); goto xfrm_tunnel_mplsip_failed; } #endif err = rtnl_link_register(&ipip_link_ops); if (err < 0) goto rtnl_link_failed; out: return err; rtnl_link_failed: #if IS_ENABLED(CONFIG_MPLS) xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS); xfrm_tunnel_mplsip_failed: #endif xfrm4_tunnel_deregister(&ipip_handler, AF_INET); xfrm_tunnel_ipip_failed: unregister_pernet_device(&ipip_net_ops); goto out; } static void __exit ipip_fini(void) { rtnl_link_unregister(&ipip_link_ops); if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) pr_info("%s: can't deregister tunnel\n", __func__); #if IS_ENABLED(CONFIG_MPLS) if (xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS)) pr_info("%s: can't deregister tunnel\n", __func__); #endif unregister_pernet_device(&ipip_net_ops); } module_init(ipip_init); module_exit(ipip_fini); MODULE_DESCRIPTION("IP/IP protocol decoder library"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("ipip"); MODULE_ALIAS_NETDEV("tunl0");
4 1 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nft_meta.h> #include <linux/if_bridge.h> #include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */ #include "../br_private.h" static const struct net_device * nft_meta_get_bridge(const struct net_device *dev) { if (dev && netif_is_bridge_port(dev)) return netdev_master_upper_dev_get_rcu((struct net_device *)dev); return NULL; } static void nft_meta_bridge_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_meta *priv = nft_expr_priv(expr); const struct net_device *in = nft_in(pkt), *out = nft_out(pkt); u32 *dest = &regs->data[priv->dreg]; const struct net_device *br_dev; switch (priv->key) { case NFT_META_BRI_IIFNAME: br_dev = nft_meta_get_bridge(in); break; case NFT_META_BRI_OIFNAME: br_dev = nft_meta_get_bridge(out); break; case NFT_META_BRI_IIFPVID: { u16 p_pvid; br_dev = nft_meta_get_bridge(in); if (!br_dev || !br_vlan_enabled(br_dev)) goto err; br_vlan_get_pvid_rcu(in, &p_pvid); nft_reg_store16(dest, p_pvid); return; } case NFT_META_BRI_IIFVPROTO: { u16 p_proto; br_dev = nft_meta_get_bridge(in); if (!br_dev || !br_vlan_enabled(br_dev)) goto err; br_vlan_get_proto(br_dev, &p_proto); nft_reg_store_be16(dest, htons(p_proto)); return; } case NFT_META_BRI_IIFHWADDR: br_dev = nft_meta_get_bridge(in); if (!br_dev) goto err; memcpy(dest, br_dev->dev_addr, ETH_ALEN); return; default: return nft_meta_get_eval(expr, regs, pkt); } strscpy_pad((char *)dest, br_dev ? br_dev->name : "", IFNAMSIZ); return; err: regs->verdict.code = NFT_BREAK; } static int nft_meta_bridge_get_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int len; priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); switch (priv->key) { case NFT_META_BRI_IIFNAME: case NFT_META_BRI_OIFNAME: len = IFNAMSIZ; break; case NFT_META_BRI_IIFPVID: case NFT_META_BRI_IIFVPROTO: len = sizeof(u16); break; case NFT_META_BRI_IIFHWADDR: len = ETH_ALEN; break; default: return nft_meta_get_init(ctx, expr, tb); } priv->len = len; return nft_parse_register_store(ctx, tb[NFTA_META_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, len); } static struct nft_expr_type nft_meta_bridge_type; static const struct nft_expr_ops nft_meta_bridge_get_ops = { .type = &nft_meta_bridge_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), .eval = nft_meta_bridge_get_eval, .init = nft_meta_bridge_get_init, .dump = nft_meta_get_dump, .reduce = nft_meta_get_reduce, }; static void nft_meta_bridge_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_meta *meta = nft_expr_priv(expr); u32 *sreg = &regs->data[meta->sreg]; struct sk_buff *skb = pkt->skb; u8 value8; switch (meta->key) { case NFT_META_BRI_BROUTE: value8 = nft_reg_load8(sreg); BR_INPUT_SKB_CB(skb)->br_netfilter_broute = !!value8; break; default: nft_meta_set_eval(expr, regs, pkt); } } static int nft_meta_bridge_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int len; int err; priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); switch (priv->key) { case NFT_META_BRI_BROUTE: len = sizeof(u8); break; default: return nft_meta_set_init(ctx, expr, tb); } priv->len = len; err = nft_parse_register_load(ctx, tb[NFTA_META_SREG], &priv->sreg, len); if (err < 0) return err; return 0; } static bool nft_meta_bridge_set_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { int i; for (i = 0; i < NFT_REG32_NUM; i++) { if (!track->regs[i].selector) continue; if (track->regs[i].selector->ops != &nft_meta_bridge_get_ops) continue; __nft_reg_track_cancel(track, i); } return false; } static int nft_meta_bridge_set_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int hooks; switch (priv->key) { case NFT_META_BRI_BROUTE: case NFT_META_BRI_IIFHWADDR: hooks = 1 << NF_BR_PRE_ROUTING; break; default: return nft_meta_set_validate(ctx, expr); } return nft_chain_validate_hooks(ctx->chain, hooks); } static const struct nft_expr_ops nft_meta_bridge_set_ops = { .type = &nft_meta_bridge_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), .eval = nft_meta_bridge_set_eval, .init = nft_meta_bridge_set_init, .destroy = nft_meta_set_destroy, .dump = nft_meta_set_dump, .reduce = nft_meta_bridge_set_reduce, .validate = nft_meta_bridge_set_validate, }; static const struct nft_expr_ops * nft_meta_bridge_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { if (tb[NFTA_META_KEY] == NULL) return ERR_PTR(-EINVAL); if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG]) return ERR_PTR(-EINVAL); if (tb[NFTA_META_DREG]) return &nft_meta_bridge_get_ops; if (tb[NFTA_META_SREG]) return &nft_meta_bridge_set_ops; return ERR_PTR(-EINVAL); } static struct nft_expr_type nft_meta_bridge_type __read_mostly = { .family = NFPROTO_BRIDGE, .name = "meta", .select_ops = nft_meta_bridge_select_ops, .policy = nft_meta_policy, .maxattr = NFTA_META_MAX, .owner = THIS_MODULE, }; static int __init nft_meta_bridge_module_init(void) { return nft_register_expr(&nft_meta_bridge_type); } static void __exit nft_meta_bridge_module_exit(void) { nft_unregister_expr(&nft_meta_bridge_type); } module_init(nft_meta_bridge_module_init); module_exit(nft_meta_bridge_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("wenxu <wenxu@ucloud.cn>"); MODULE_ALIAS_NFT_AF_EXPR(AF_BRIDGE, "meta"); MODULE_DESCRIPTION("Support for bridge dedicated meta key");
35 36 25 34 49 50 40 28 29 37 38 38 38 24 32 32 32 32 2 2 2 2 2 12 12 12 12 180 181 21 21 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/core/dst_cache.c - dst entry cache * * Copyright (c) 2016 Paolo Abeni <pabeni@redhat.com> */ #include <linux/kernel.h> #include <linux/percpu.h> #include <net/dst_cache.h> #include <net/route.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ip6_fib.h> #endif #include <uapi/linux/in.h> struct dst_cache_pcpu { unsigned long refresh_ts; struct dst_entry *dst; local_lock_t bh_lock; u32 cookie; union { struct in_addr in_saddr; struct in6_addr in6_saddr; }; }; static void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache, struct dst_entry *dst, u32 cookie) { DEBUG_NET_WARN_ON_ONCE(!in_softirq()); dst_release(dst_cache->dst); if (dst) dst_hold(dst); dst_cache->cookie = cookie; dst_cache->dst = dst; } static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, struct dst_cache_pcpu *idst) { struct dst_entry *dst; DEBUG_NET_WARN_ON_ONCE(!in_softirq()); dst = idst->dst; if (!dst) goto fail; /* the cache already hold a dst reference; it can't go away */ dst_hold(dst); if (unlikely(!time_after(idst->refresh_ts, READ_ONCE(dst_cache->reset_ts)) || (READ_ONCE(dst->obsolete) && !dst->ops->check(dst, idst->cookie)))) { dst_cache_per_cpu_dst_set(idst, NULL, 0); dst_release(dst); goto fail; } return dst; fail: idst->refresh_ts = jiffies; return NULL; } struct dst_entry *dst_cache_get(struct dst_cache *dst_cache) { struct dst_entry *dst; if (!dst_cache->cache) return NULL; local_lock_nested_bh(&dst_cache->cache->bh_lock); dst = dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache)); local_unlock_nested_bh(&dst_cache->cache->bh_lock); return dst; } EXPORT_SYMBOL_GPL(dst_cache_get); struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr) { struct dst_cache_pcpu *idst; struct dst_entry *dst; if (!dst_cache->cache) return NULL; local_lock_nested_bh(&dst_cache->cache->bh_lock); idst = this_cpu_ptr(dst_cache->cache); dst = dst_cache_per_cpu_get(dst_cache, idst); if (!dst) { local_unlock_nested_bh(&dst_cache->cache->bh_lock); return NULL; } *saddr = idst->in_saddr.s_addr; local_unlock_nested_bh(&dst_cache->cache->bh_lock); return dst_rtable(dst); } EXPORT_SYMBOL_GPL(dst_cache_get_ip4); void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, __be32 saddr) { struct dst_cache_pcpu *idst; if (!dst_cache->cache) return; local_lock_nested_bh(&dst_cache->cache->bh_lock); idst = this_cpu_ptr(dst_cache->cache); dst_cache_per_cpu_dst_set(idst, dst, 0); idst->in_saddr.s_addr = saddr; local_unlock_nested_bh(&dst_cache->cache->bh_lock); } EXPORT_SYMBOL_GPL(dst_cache_set_ip4); #if IS_ENABLED(CONFIG_IPV6) void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, const struct in6_addr *saddr) { struct dst_cache_pcpu *idst; if (!dst_cache->cache) return; local_lock_nested_bh(&dst_cache->cache->bh_lock); idst = this_cpu_ptr(dst_cache->cache); dst_cache_per_cpu_dst_set(idst, dst, rt6_get_cookie(dst_rt6_info(dst))); idst->in6_saddr = *saddr; local_unlock_nested_bh(&dst_cache->cache->bh_lock); } EXPORT_SYMBOL_GPL(dst_cache_set_ip6); struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache, struct in6_addr *saddr) { struct dst_cache_pcpu *idst; struct dst_entry *dst; if (!dst_cache->cache) return NULL; local_lock_nested_bh(&dst_cache->cache->bh_lock); idst = this_cpu_ptr(dst_cache->cache); dst = dst_cache_per_cpu_get(dst_cache, idst); if (!dst) { local_unlock_nested_bh(&dst_cache->cache->bh_lock); return NULL; } *saddr = idst->in6_saddr; local_unlock_nested_bh(&dst_cache->cache->bh_lock); return dst; } EXPORT_SYMBOL_GPL(dst_cache_get_ip6); #endif int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp) { unsigned int i; dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu, gfp | __GFP_ZERO); if (!dst_cache->cache) return -ENOMEM; for_each_possible_cpu(i) local_lock_init(&per_cpu_ptr(dst_cache->cache, i)->bh_lock); dst_cache_reset(dst_cache); return 0; } EXPORT_SYMBOL_GPL(dst_cache_init); void dst_cache_destroy(struct dst_cache *dst_cache) { int i; if (!dst_cache->cache) return; for_each_possible_cpu(i) dst_release(per_cpu_ptr(dst_cache->cache, i)->dst); free_percpu(dst_cache->cache); } EXPORT_SYMBOL_GPL(dst_cache_destroy); void dst_cache_reset_now(struct dst_cache *dst_cache) { int i; if (!dst_cache->cache) return; dst_cache_reset(dst_cache); for_each_possible_cpu(i) { struct dst_cache_pcpu *idst = per_cpu_ptr(dst_cache->cache, i); struct dst_entry *dst = idst->dst; idst->cookie = 0; idst->dst = NULL; dst_release(dst); } } EXPORT_SYMBOL_GPL(dst_cache_reset_now);
17707 17796 17845 18204 18253 196 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __X86_KERNEL_FPU_CONTEXT_H #define __X86_KERNEL_FPU_CONTEXT_H #include <asm/fpu/xstate.h> #include <asm/trace/fpu.h> /* Functions related to FPU context tracking */ /* * The in-register FPU state for an FPU context on a CPU is assumed to be * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx * matches the FPU. * * If the FPU register state is valid, the kernel can skip restoring the * FPU state from memory. * * Any code that clobbers the FPU registers or updates the in-memory * FPU state for a task MUST let the rest of the kernel know that the * FPU registers are no longer valid for this task. * * Invalidate a resource you control: CPU if using the CPU for something else * (with preemption disabled), FPU for the current task, or a task that * is prevented from running by the current task. */ static inline void __cpu_invalidate_fpregs_state(void) { __this_cpu_write(fpu_fpregs_owner_ctx, NULL); } static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) { fpu->last_cpu = -1; } static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) { return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; } static inline void fpregs_deactivate(struct fpu *fpu) { __this_cpu_write(fpu_fpregs_owner_ctx, NULL); trace_x86_fpu_regs_deactivated(fpu); } static inline void fpregs_activate(struct fpu *fpu) { __this_cpu_write(fpu_fpregs_owner_ctx, fpu); trace_x86_fpu_regs_activated(fpu); } /* Internal helper for switch_fpu_return() and signal frame setup */ static inline void fpregs_restore_userregs(void) { struct fpu *fpu = x86_task_fpu(current); int cpu = smp_processor_id(); if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_USER_WORKER))) return; if (!fpregs_state_valid(fpu, cpu)) { /* * This restores _all_ xstate which has not been * established yet. * * If PKRU is enabled, then the PKRU value is already * correct because it was either set in switch_to() or in * flush_thread(). So it is excluded because it might be * not up to date in current->thread.fpu->xsave state. * * XFD state is handled in restore_fpregs_from_fpstate(). */ restore_fpregs_from_fpstate(fpu->fpstate, XFEATURE_MASK_FPSTATE); fpregs_activate(fpu); fpu->last_cpu = cpu; } clear_thread_flag(TIF_NEED_FPU_LOAD); } #endif
52 18281 2998 3004 31 226 3354 4326 346 7 362 11 3 118 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_CGROUP_H #define _LINUX_CGROUP_H /* * cgroup interface * * Copyright (C) 2003 BULL SA * Copyright (C) 2004-2006 Silicon Graphics, Inc. * */ #include <linux/sched.h> #include <linux/nodemask.h> #include <linux/list.h> #include <linux/rculist.h> #include <linux/cgroupstats.h> #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/kernfs.h> #include <linux/jump_label.h> #include <linux/types.h> #include <linux/notifier.h> #include <linux/ns_common.h> #include <linux/nsproxy.h> #include <linux/user_namespace.h> #include <linux/refcount.h> #include <linux/kernel_stat.h> #include <linux/cgroup-defs.h> #include <linux/cgroup_namespace.h> struct kernel_clone_args; /* * All weight knobs on the default hierarchy should use the following min, * default and max values. The default value is the logarithmic center of * MIN and MAX and allows 100x to be expressed in both directions. */ #define CGROUP_WEIGHT_MIN 1 #define CGROUP_WEIGHT_DFL 100 #define CGROUP_WEIGHT_MAX 10000 #ifdef CONFIG_CGROUPS enum css_task_iter_flags { CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ CSS_TASK_ITER_SKIPPED = (1U << 16), /* internal flags */ }; /* a css_task_iter should be treated as an opaque object */ struct css_task_iter { struct cgroup_subsys *ss; unsigned int flags; struct list_head *cset_pos; struct list_head *cset_head; struct list_head *tcset_pos; struct list_head *tcset_head; struct list_head *task_pos; struct list_head *cur_tasks_head; struct css_set *cur_cset; struct css_set *cur_dcset; struct task_struct *cur_task; struct list_head iters_node; /* css_set->task_iters */ }; enum cgroup_lifetime_events { CGROUP_LIFETIME_ONLINE, CGROUP_LIFETIME_OFFLINE, }; extern struct file_system_type cgroup_fs_type; extern struct cgroup_root cgrp_dfl_root; extern struct css_set init_css_set; extern spinlock_t css_set_lock; extern struct blocking_notifier_head cgroup_lifetime_notifier; #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; #include <linux/cgroup_subsys.h> #undef SUBSYS #define SUBSYS(_x) \ extern struct static_key_true _x ## _cgrp_subsys_enabled_key; \ extern struct static_key_true _x ## _cgrp_subsys_on_dfl_key; #include <linux/cgroup_subsys.h> #undef SUBSYS /** * cgroup_subsys_enabled - fast test on whether a subsys is enabled * @ss: subsystem in question */ #define cgroup_subsys_enabled(ss) \ static_branch_likely(&ss ## _enabled_key) /** * cgroup_subsys_on_dfl - fast test on whether a subsys is on default hierarchy * @ss: subsystem in question */ #define cgroup_subsys_on_dfl(ss) \ static_branch_likely(&ss ## _on_dfl_key) bool css_has_online_children(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup, struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, struct cgroup_subsys *ss); struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss); struct cgroup *cgroup_get_from_path(const char *path); struct cgroup *cgroup_get_from_fd(int fd); struct cgroup *cgroup_v1v2_get_from_fd(int fd); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); void cgroup_file_notify(struct cgroup_file *cfile); void cgroup_file_show(struct cgroup_file *cfile, bool show); int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk); void cgroup_fork(struct task_struct *p); extern int cgroup_can_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs); void cgroup_exit(struct task_struct *p); void cgroup_release(struct task_struct *p); void cgroup_free(struct task_struct *p); int cgroup_init_early(void); int cgroup_init(void); int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v); /* * Iteration helpers and macros. */ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *parent); struct cgroup_subsys_state *css_next_descendant_pre(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state *pos); struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *css); struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp); struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp); void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it); struct task_struct *css_task_iter_next(struct css_task_iter *it); void css_task_iter_end(struct css_task_iter *it); /** * css_for_each_child - iterate through children of a css * @pos: the css * to use as the loop cursor * @parent: css whose children to walk * * Walk @parent's children. Must be called under rcu_read_lock(). * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. * * It is allowed to temporarily drop RCU read lock during iteration. The * caller is responsible for ensuring that @pos remains accessible until * the start of the next iteration by, for example, bumping the css refcnt. */ #define css_for_each_child(pos, parent) \ for ((pos) = css_next_child(NULL, (parent)); (pos); \ (pos) = css_next_child((pos), (parent))) /** * css_for_each_descendant_pre - pre-order walk of a css's descendants * @pos: the css * to use as the loop cursor * @root: css whose descendants to walk * * Walk @root's descendants. @root is included in the iteration and the * first node to be visited. Must be called under rcu_read_lock(). * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. * * For example, the following guarantees that a descendant can't escape * state updates of its ancestors. * * my_online(@css) * { * Lock @css's parent and @css; * Inherit state from the parent; * Unlock both. * } * * my_update_state(@css) * { * css_for_each_descendant_pre(@pos, @css) { * Lock @pos; * if (@pos == @css) * Update @css's state; * else * Verify @pos is alive and inherit state from its parent; * Unlock @pos; * } * } * * As long as the inheriting step, including checking the parent state, is * enclosed inside @pos locking, double-locking the parent isn't necessary * while inheriting. The state update to the parent is guaranteed to be * visible by walking order and, as long as inheriting operations to the * same @pos are atomic to each other, multiple updates racing each other * still result in the correct state. It's guaranateed that at least one * inheritance happens for any css after the latest update to its parent. * * If checking parent's state requires locking the parent, each inheriting * iteration should lock and unlock both @pos->parent and @pos. * * Alternatively, a subsystem may choose to use a single global lock to * synchronize ->css_online() and ->css_offline() against tree-walking * operations. * * It is allowed to temporarily drop RCU read lock during iteration. The * caller is responsible for ensuring that @pos remains accessible until * the start of the next iteration by, for example, bumping the css refcnt. */ #define css_for_each_descendant_pre(pos, css) \ for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ (pos) = css_next_descendant_pre((pos), (css))) /** * css_for_each_descendant_post - post-order walk of a css's descendants * @pos: the css * to use as the loop cursor * @css: css whose descendants to walk * * Similar to css_for_each_descendant_pre() but performs post-order * traversal instead. @root is included in the iteration and the last * node to be visited. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. * * Note that the walk visibility guarantee example described in pre-order * walk doesn't apply the same to post-order walks. */ #define css_for_each_descendant_post(pos, css) \ for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ (pos) = css_next_descendant_post((pos), (css))) /** * cgroup_taskset_for_each - iterate cgroup_taskset * @task: the loop cursor * @dst_css: the destination css * @tset: taskset to iterate * * @tset may contain multiple tasks and they may belong to multiple * processes. * * On the v2 hierarchy, there may be tasks from multiple processes and they * may not share the source or destination csses. * * On traditional hierarchies, when there are multiple tasks in @tset, if a * task of a process is in @tset, all tasks of the process are in @tset. * Also, all are guaranteed to share the same source and destination csses. * * Iteration is not in any specific order. */ #define cgroup_taskset_for_each(task, dst_css, tset) \ for ((task) = cgroup_taskset_first((tset), &(dst_css)); \ (task); \ (task) = cgroup_taskset_next((tset), &(dst_css))) /** * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset * @leader: the loop cursor * @dst_css: the destination css * @tset: taskset to iterate * * Iterate threadgroup leaders of @tset. For single-task migrations, @tset * may not contain any. */ #define cgroup_taskset_for_each_leader(leader, dst_css, tset) \ for ((leader) = cgroup_taskset_first((tset), &(dst_css)); \ (leader); \ (leader) = cgroup_taskset_next((tset), &(dst_css))) \ if ((leader) != (leader)->group_leader) \ ; \ else /* * Inline functions. */ #ifdef CONFIG_DEBUG_CGROUP_REF void css_get(struct cgroup_subsys_state *css); void css_get_many(struct cgroup_subsys_state *css, unsigned int n); bool css_tryget(struct cgroup_subsys_state *css); bool css_tryget_online(struct cgroup_subsys_state *css); void css_put(struct cgroup_subsys_state *css); void css_put_many(struct cgroup_subsys_state *css, unsigned int n); #else #define CGROUP_REF_FN_ATTRS static inline #define CGROUP_REF_EXPORT(fn) #include <linux/cgroup_refcnt.h> #endif static inline u64 cgroup_id(const struct cgroup *cgrp) { return cgrp->kn->id; } /** * css_is_dying - test whether the specified css is dying * @css: target css * * Test whether @css is in the process of offlining or already offline. In * most cases, ->css_online() and ->css_offline() callbacks should be * enough; however, the actual offline operations are RCU delayed and this * test returns %true also when @css is scheduled to be offlined. * * This is useful, for example, when the use case requires synchronous * behavior with respect to cgroup removal. cgroup removal schedules css * offlining but the css can seem alive while the operation is being * delayed. If the delay affects user visible semantics, this test can be * used to resolve the situation. */ static inline bool css_is_dying(struct cgroup_subsys_state *css) { return css->flags & CSS_DYING; } static inline bool css_is_online(struct cgroup_subsys_state *css) { return css->flags & CSS_ONLINE; } static inline bool css_is_self(struct cgroup_subsys_state *css) { if (css == &css->cgroup->self) { /* cgroup::self should not have subsystem association */ WARN_ON(css->ss != NULL); return true; } return false; } static inline void cgroup_get(struct cgroup *cgrp) { css_get(&cgrp->self); } static inline bool cgroup_tryget(struct cgroup *cgrp) { return css_tryget(&cgrp->self); } static inline void cgroup_put(struct cgroup *cgrp) { css_put(&cgrp->self); } extern struct mutex cgroup_mutex; static inline void cgroup_lock(void) { mutex_lock(&cgroup_mutex); } static inline void cgroup_unlock(void) { mutex_unlock(&cgroup_mutex); } /** * task_css_set_check - obtain a task's css_set with extra access conditions * @task: the task to obtain css_set for * @__c: extra condition expression to be passed to rcu_dereference_check() * * A task's css_set is RCU protected, initialized and exited while holding * task_lock(), and can only be modified while holding both cgroup_mutex * and task_lock() while the task is alive. This macro verifies that the * caller is inside proper critical section and returns @task's css_set. * * The caller can also specify additional allowed conditions via @__c, such * as locks used during the cgroup_subsys::attach() methods. */ #ifdef CONFIG_PROVE_RCU #define task_css_set_check(task, __c) \ rcu_dereference_check((task)->cgroups, \ rcu_read_lock_sched_held() || \ lockdep_is_held(&cgroup_mutex) || \ lockdep_is_held(&css_set_lock) || \ ((task)->flags & PF_EXITING) || (__c)) #else #define task_css_set_check(task, __c) \ rcu_dereference((task)->cgroups) #endif /** * task_css_check - obtain css for (task, subsys) w/ extra access conds * @task: the target task * @subsys_id: the target subsystem ID * @__c: extra condition expression to be passed to rcu_dereference_check() * * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The * synchronization rules are the same as task_css_set_check(). */ #define task_css_check(task, subsys_id, __c) \ task_css_set_check((task), (__c))->subsys[(subsys_id)] /** * task_css_set - obtain a task's css_set * @task: the task to obtain css_set for * * See task_css_set_check(). */ static inline struct css_set *task_css_set(struct task_struct *task) { return task_css_set_check(task, false); } /** * task_css - obtain css for (task, subsys) * @task: the target task * @subsys_id: the target subsystem ID * * See task_css_check(). */ static inline struct cgroup_subsys_state *task_css(struct task_struct *task, int subsys_id) { return task_css_check(task, subsys_id, false); } /** * task_get_css - find and get the css for (task, subsys) * @task: the target task * @subsys_id: the target subsystem ID * * Find the css for the (@task, @subsys_id) combination, increment a * reference on and return it. This function is guaranteed to return a * valid css. The returned css may already have been offlined. */ static inline struct cgroup_subsys_state * task_get_css(struct task_struct *task, int subsys_id) { struct cgroup_subsys_state *css; rcu_read_lock(); while (true) { css = task_css(task, subsys_id); /* * Can't use css_tryget_online() here. A task which has * PF_EXITING set may stay associated with an offline css. * If such task calls this function, css_tryget_online() * will keep failing. */ if (likely(css_tryget(css))) break; cpu_relax(); } rcu_read_unlock(); return css; } /** * task_css_is_root - test whether a task belongs to the root css * @task: the target task * @subsys_id: the target subsystem ID * * Test whether @task belongs to the root css on the specified subsystem. * May be invoked in any context. */ static inline bool task_css_is_root(struct task_struct *task, int subsys_id) { return task_css_check(task, subsys_id, true) == init_css_set.subsys[subsys_id]; } static inline struct cgroup *task_cgroup(struct task_struct *task, int subsys_id) { return task_css(task, subsys_id)->cgroup; } static inline struct cgroup *task_dfl_cgroup(struct task_struct *task) { return task_css_set(task)->dfl_cgrp; } static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) { struct cgroup_subsys_state *parent_css = cgrp->self.parent; if (parent_css) return container_of(parent_css, struct cgroup, self); return NULL; } /** * cgroup_is_descendant - test ancestry * @cgrp: the cgroup to be tested * @ancestor: possible ancestor of @cgrp * * Test whether @cgrp is a descendant of @ancestor. It also returns %true * if @cgrp == @ancestor. This function is safe to call as long as @cgrp * and @ancestor are accessible. */ static inline bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) { if (cgrp->root != ancestor->root || cgrp->level < ancestor->level) return false; return cgrp->ancestors[ancestor->level] == ancestor; } /** * cgroup_ancestor - find ancestor of cgroup * @cgrp: cgroup to find ancestor of * @ancestor_level: level of ancestor to find starting from root * * Find ancestor of cgroup at specified level starting from root if it exists * and return pointer to it. Return NULL if @cgrp doesn't have ancestor at * @ancestor_level. * * This function is safe to call as long as @cgrp is accessible. */ static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp, int ancestor_level) { if (ancestor_level < 0 || ancestor_level > cgrp->level) return NULL; return cgrp->ancestors[ancestor_level]; } /** * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry * @task: the task to be tested * @ancestor: possible ancestor of @task's cgroup * * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor. * It follows all the same rules as cgroup_is_descendant, and only applies * to the default hierarchy. */ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) { struct css_set *cset = task_css_set(task); return cgroup_is_descendant(cset->dfl_cgrp, ancestor); } /* no synchronization, the result can only be used as a hint */ static inline bool cgroup_is_populated(struct cgroup *cgrp) { return cgrp->nr_populated_csets + cgrp->nr_populated_domain_children + cgrp->nr_populated_threaded_children; } /* returns ino associated with a cgroup */ static inline ino_t cgroup_ino(struct cgroup *cgrp) { return kernfs_ino(cgrp->kn); } /* cft/css accessors for cftype->write() operation */ static inline struct cftype *of_cft(struct kernfs_open_file *of) { return of->kn->priv; } struct cgroup_subsys_state *of_css(struct kernfs_open_file *of); /* cft/css accessors for cftype->seq_*() operations */ static inline struct cftype *seq_cft(struct seq_file *seq) { return of_cft(seq->private); } static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) { return of_css(seq->private); } /* * Name / path handling functions. All are thin wrappers around the kernfs * counterparts and can be called under any context. */ static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) { return kernfs_name(cgrp->kn, buf, buflen); } static inline int cgroup_path(struct cgroup *cgrp, char *buf, size_t buflen) { return kernfs_path(cgrp->kn, buf, buflen); } static inline void pr_cont_cgroup_name(struct cgroup *cgrp) { pr_cont_kernfs_name(cgrp->kn); } static inline void pr_cont_cgroup_path(struct cgroup *cgrp) { pr_cont_kernfs_path(cgrp->kn); } bool cgroup_psi_enabled(void); static inline void cgroup_init_kthreadd(void) { /* * kthreadd is inherited by all kthreads, keep it in the root so * that the new kthreads are guaranteed to stay in the root until * initialization is finished. */ current->no_cgroup_migration = 1; } static inline void cgroup_kthread_ready(void) { /* * This kthread finished initialization. The creator should have * set PF_NO_SETAFFINITY if this kthread should stay in the root. */ current->no_cgroup_migration = 0; } void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen); struct cgroup *__cgroup_get_from_id(u64 id); struct cgroup *cgroup_get_from_id(u64 id); #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; struct cgroup; static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } static inline void css_get(struct cgroup_subsys_state *css) {} static inline void css_put(struct cgroup_subsys_state *css) {} static inline void cgroup_lock(void) {} static inline void cgroup_unlock(void) {} static inline int cgroup_attach_task_all(struct task_struct *from, struct task_struct *t) { return 0; } static inline int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { return -EINVAL; } static inline void cgroup_fork(struct task_struct *p) {} static inline int cgroup_can_fork(struct task_struct *p, struct kernel_clone_args *kargs) { return 0; } static inline void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} static inline void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} static inline void cgroup_exit(struct task_struct *p) {} static inline void cgroup_release(struct task_struct *p) {} static inline void cgroup_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } static inline void cgroup_init_kthreadd(void) {} static inline void cgroup_kthread_ready(void) {} static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) { return NULL; } static inline bool cgroup_psi_enabled(void) { return false; } static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) { return true; } static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) {} #endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUPS /* * cgroup scalable recursive statistics. */ void css_rstat_updated(struct cgroup_subsys_state *css, int cpu); void css_rstat_flush(struct cgroup_subsys_state *css); /* * Basic resource stats. */ #ifdef CONFIG_CGROUP_CPUACCT void cpuacct_charge(struct task_struct *tsk, u64 cputime); void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); #else static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} static inline void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) {} #endif void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec); void __cgroup_account_cputime_field(struct cgroup *cgrp, enum cpu_usage_stat index, u64 delta_exec); static inline void cgroup_account_cputime(struct task_struct *task, u64 delta_exec) { struct cgroup *cgrp; cpuacct_charge(task, delta_exec); cgrp = task_dfl_cgroup(task); if (cgroup_parent(cgrp)) __cgroup_account_cputime(cgrp, delta_exec); } static inline void cgroup_account_cputime_field(struct task_struct *task, enum cpu_usage_stat index, u64 delta_exec) { struct cgroup *cgrp; cpuacct_account_field(task, index, delta_exec); cgrp = task_dfl_cgroup(task); if (cgroup_parent(cgrp)) __cgroup_account_cputime_field(cgrp, index, delta_exec); } #else /* CONFIG_CGROUPS */ static inline void cgroup_account_cputime(struct task_struct *task, u64 delta_exec) {} static inline void cgroup_account_cputime_field(struct task_struct *task, enum cpu_usage_stat index, u64 delta_exec) {} #endif /* CONFIG_CGROUPS */ /* * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data * definition in cgroup-defs.h. */ #ifdef CONFIG_SOCK_CGROUP_DATA void cgroup_sk_alloc(struct sock_cgroup_data *skcd); void cgroup_sk_clone(struct sock_cgroup_data *skcd); void cgroup_sk_free(struct sock_cgroup_data *skcd); static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) { return skcd->cgroup; } #else /* CONFIG_CGROUP_DATA */ static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {} static inline void cgroup_sk_clone(struct sock_cgroup_data *skcd) {} static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_CGROUP_DATA */ #ifdef CONFIG_CGROUPS void cgroup_enter_frozen(void); void cgroup_leave_frozen(bool always_leave); void cgroup_update_frozen(struct cgroup *cgrp); void cgroup_freeze(struct cgroup *cgrp, bool freeze); void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src, struct cgroup *dst); static inline bool cgroup_task_frozen(struct task_struct *task) { return task->frozen; } #else /* !CONFIG_CGROUPS */ static inline void cgroup_enter_frozen(void) { } static inline void cgroup_leave_frozen(bool always_leave) { } static inline bool cgroup_task_frozen(struct task_struct *task) { return false; } #endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUP_BPF static inline void cgroup_bpf_get(struct cgroup *cgrp) { percpu_ref_get(&cgrp->bpf.refcnt); } static inline void cgroup_bpf_put(struct cgroup *cgrp) { percpu_ref_put(&cgrp->bpf.refcnt); } #else /* CONFIG_CGROUP_BPF */ static inline void cgroup_bpf_get(struct cgroup *cgrp) {} static inline void cgroup_bpf_put(struct cgroup *cgrp) {} #endif /* CONFIG_CGROUP_BPF */ struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id); struct cgroup_of_peak *of_peak(struct kernfs_open_file *of); #endif /* _LINUX_CGROUP_H */
23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com> */ #include "core_priv.h" /** * ib_device_register_rdmacg - register with rdma cgroup. * @device: device to register to participate in resource * accounting by rdma cgroup. * * Register with the rdma cgroup. Should be called before * exposing rdma device to user space applications to avoid * resource accounting leak. */ void ib_device_register_rdmacg(struct ib_device *device) { device->cg_device.name = device->name; rdmacg_register_device(&device->cg_device); } /** * ib_device_unregister_rdmacg - unregister with rdma cgroup. * @device: device to unregister. * * Unregister with the rdma cgroup. Should be called after * all the resources are deallocated, and after a stage when any * other resource allocation by user application cannot be done * for this device to avoid any leak in accounting. */ void ib_device_unregister_rdmacg(struct ib_device *device) { rdmacg_unregister_device(&device->cg_device); } int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, struct ib_device *device, enum rdmacg_resource_type resource_index) { return rdmacg_try_charge(&cg_obj->cg, &device->cg_device, resource_index); } EXPORT_SYMBOL(ib_rdmacg_try_charge); void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, struct ib_device *device, enum rdmacg_resource_type resource_index) { rdmacg_uncharge(cg_obj->cg, &device->cg_device, resource_index); } EXPORT_SYMBOL(ib_rdmacg_uncharge);
419 57 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_LIST_NULLS_H #define _LINUX_LIST_NULLS_H #include <linux/poison.h> #include <linux/const.h> /* * Special version of lists, where end of list is not a NULL pointer, * but a 'nulls' marker, which can have many different values. * (up to 2^31 different values guaranteed on all platforms) * * In the standard hlist, termination of a list is the NULL pointer. * In this special 'nulls' variant, we use the fact that objects stored in * a list are aligned on a word (4 or 8 bytes alignment). * We therefore use the last significant bit of 'ptr' : * Set to 1 : This is a 'nulls' end-of-list marker (ptr >> 1) * Set to 0 : This is a pointer to some object (ptr) */ struct hlist_nulls_head { struct hlist_nulls_node *first; }; struct hlist_nulls_node { struct hlist_nulls_node *next, **pprev; }; #define NULLS_MARKER(value) (1UL | (((long)value) << 1)) #define INIT_HLIST_NULLS_HEAD(ptr, nulls) \ ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls)) #define HLIST_NULLS_HEAD_INIT(nulls) {.first = (struct hlist_nulls_node *)NULLS_MARKER(nulls)} #define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_nulls_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ !is_a_nulls(____ptr) ? hlist_nulls_entry(____ptr, type, member) : NULL; \ }) /** * ptr_is_a_nulls - Test if a ptr is a nulls * @ptr: ptr to be tested * */ static inline int is_a_nulls(const struct hlist_nulls_node *ptr) { return ((unsigned long)ptr & 1); } /** * get_nulls_value - Get the 'nulls' value of the end of chain * @ptr: end of chain * * Should be called only if is_a_nulls(ptr); */ static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr) { return ((unsigned long)ptr) >> 1; } /** * hlist_nulls_unhashed - Has node been removed and reinitialized? * @h: Node to be checked * * Not that not all removal functions will leave a node in unhashed state. * For example, hlist_del_init_rcu() leaves the node in unhashed state, * but hlist_nulls_del() does not. */ static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h) { return !h->pprev; } /** * hlist_nulls_unhashed_lockless - Has node been removed and reinitialized? * @h: Node to be checked * * Not that not all removal functions will leave a node in unhashed state. * For example, hlist_del_init_rcu() leaves the node in unhashed state, * but hlist_nulls_del() does not. Unlike hlist_nulls_unhashed(), this * function may be used locklessly. */ static inline int hlist_nulls_unhashed_lockless(const struct hlist_nulls_node *h) { return !READ_ONCE(h->pprev); } static inline int hlist_nulls_empty(const struct hlist_nulls_head *h) { return is_a_nulls(READ_ONCE(h->first)); } static inline void hlist_nulls_add_head(struct hlist_nulls_node *n, struct hlist_nulls_head *h) { struct hlist_nulls_node *first = h->first; n->next = first; WRITE_ONCE(n->pprev, &h->first); h->first = n; if (!is_a_nulls(first)) WRITE_ONCE(first->pprev, &n->next); } static inline void __hlist_nulls_del(struct hlist_nulls_node *n) { struct hlist_nulls_node *next = n->next; struct hlist_nulls_node **pprev = n->pprev; WRITE_ONCE(*pprev, next); if (!is_a_nulls(next)) WRITE_ONCE(next->pprev, pprev); } static inline void hlist_nulls_del(struct hlist_nulls_node *n) { __hlist_nulls_del(n); WRITE_ONCE(n->pprev, LIST_POISON2); } /** * hlist_nulls_for_each_entry - iterate over list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * */ #define hlist_nulls_for_each_entry(tpos, pos, head, member) \ for (pos = (head)->first; \ (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \ pos = pos->next) /** * hlist_nulls_for_each_entry_from - iterate over a hlist continuing from current point * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @member: the name of the hlist_node within the struct. * */ #define hlist_nulls_for_each_entry_from(tpos, pos, member) \ for (; (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \ pos = pos->next) #endif
33 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/kernel.h> #include <linux/types.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/pkt_sched.h> #include <net/caif/caif_layer.h> #include <net/caif/cfsrvl.h> #include <net/caif/cfpkt.h> #include <net/caif/caif_dev.h> #define SRVL_CTRL_PKT_SIZE 1 #define SRVL_FLOW_OFF 0x81 #define SRVL_FLOW_ON 0x80 #define SRVL_SET_PIN 0x82 #define container_obj(layr) container_of(layr, struct cfsrvl, layer) static void cfservl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid) { struct cfsrvl *service = container_obj(layr); if (layr->up == NULL || layr->up->ctrlcmd == NULL) return; switch (ctrl) { case CAIF_CTRLCMD_INIT_RSP: service->open = true; layr->up->ctrlcmd(layr->up, ctrl, phyid); break; case CAIF_CTRLCMD_DEINIT_RSP: case CAIF_CTRLCMD_INIT_FAIL_RSP: service->open = false; layr->up->ctrlcmd(layr->up, ctrl, phyid); break; case _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND: if (phyid != service->dev_info.id) break; if (service->modem_flow_on) layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_OFF_IND, phyid); service->phy_flow_on = false; break; case _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND: if (phyid != service->dev_info.id) return; if (service->modem_flow_on) { layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_ON_IND, phyid); } service->phy_flow_on = true; break; case CAIF_CTRLCMD_FLOW_OFF_IND: if (service->phy_flow_on) { layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_OFF_IND, phyid); } service->modem_flow_on = false; break; case CAIF_CTRLCMD_FLOW_ON_IND: if (service->phy_flow_on) { layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_ON_IND, phyid); } service->modem_flow_on = true; break; case _CAIF_CTRLCMD_PHYIF_DOWN_IND: /* In case interface is down, let's fake a remove shutdown */ layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, phyid); break; case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND: layr->up->ctrlcmd(layr->up, ctrl, phyid); break; default: pr_warn("Unexpected ctrl in cfsrvl (%d)\n", ctrl); /* We have both modem and phy flow on, send flow on */ layr->up->ctrlcmd(layr->up, ctrl, phyid); service->phy_flow_on = true; break; } } static int cfservl_modemcmd(struct cflayer *layr, enum caif_modemcmd ctrl) { struct cfsrvl *service = container_obj(layr); caif_assert(layr != NULL); caif_assert(layr->dn != NULL); caif_assert(layr->dn->transmit != NULL); if (!service->supports_flowctrl) return 0; switch (ctrl) { case CAIF_MODEMCMD_FLOW_ON_REQ: { struct cfpkt *pkt; struct caif_payload_info *info; u8 flow_on = SRVL_FLOW_ON; pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE); if (!pkt) return -ENOMEM; if (cfpkt_add_head(pkt, &flow_on, 1) < 0) { pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } info = cfpkt_info(pkt); info->channel_id = service->layer.id; info->hdr_len = 1; info->dev_info = &service->dev_info; cfpkt_set_prio(pkt, TC_PRIO_CONTROL); return layr->dn->transmit(layr->dn, pkt); } case CAIF_MODEMCMD_FLOW_OFF_REQ: { struct cfpkt *pkt; struct caif_payload_info *info; u8 flow_off = SRVL_FLOW_OFF; pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE); if (!pkt) return -ENOMEM; if (cfpkt_add_head(pkt, &flow_off, 1) < 0) { pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } info = cfpkt_info(pkt); info->channel_id = service->layer.id; info->hdr_len = 1; info->dev_info = &service->dev_info; cfpkt_set_prio(pkt, TC_PRIO_CONTROL); return layr->dn->transmit(layr->dn, pkt); } default: break; } return -EINVAL; } static void cfsrvl_release(struct cflayer *layer) { struct cfsrvl *service = container_of(layer, struct cfsrvl, layer); kfree(service); } void cfsrvl_init(struct cfsrvl *service, u8 channel_id, struct dev_info *dev_info, bool supports_flowctrl) { caif_assert(offsetof(struct cfsrvl, layer) == 0); service->open = false; service->modem_flow_on = true; service->phy_flow_on = true; service->layer.id = channel_id; service->layer.ctrlcmd = cfservl_ctrlcmd; service->layer.modemcmd = cfservl_modemcmd; service->dev_info = *dev_info; service->supports_flowctrl = supports_flowctrl; service->release = cfsrvl_release; } bool cfsrvl_ready(struct cfsrvl *service, int *err) { if (!service->open) { *err = -ENOTCONN; return false; } return true; } bool cfsrvl_phyid_match(struct cflayer *layer, int phyid) { struct cfsrvl *servl = container_obj(layer); return servl->dev_info.id == phyid; } void caif_free_client(struct cflayer *adap_layer) { struct cfsrvl *servl; if (adap_layer == NULL || adap_layer->dn == NULL) return; servl = container_obj(adap_layer->dn); servl->release(&servl->layer); } EXPORT_SYMBOL(caif_free_client); void caif_client_register_refcnt(struct cflayer *adapt_layer, void (*hold)(struct cflayer *lyr), void (*put)(struct cflayer *lyr)) { struct cfsrvl *service; if (WARN_ON(adapt_layer == NULL || adapt_layer->dn == NULL)) return; service = container_of(adapt_layer->dn, struct cfsrvl, layer); service->hold = hold; service->put = put; } EXPORT_SYMBOL(caif_client_register_refcnt);
9 9 9 9 9 4 1 3 1 1 6 6 5 5 5 2 3 13 13 13 2 1 1 1 1 1 1 1 1 1 8 7 5 7 4 11 6 6 4 1 3 4 7 7 7 7 7 8 8 8 16 1 1 5 37 37 40 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 // SPDX-License-Identifier: GPL-2.0-or-later /* -*- linux-c -*- * INET 802.1Q VLAN * Ethernet-type device handling. * * Authors: Ben Greear <greearb@candelatech.com> * Please send support related email to: netdev@vger.kernel.org * VLAN Home Page: http://www.candelatech.com/~greear/vlan.html * * Fixes: Mar 22 2001: Martin Bokaemper <mbokaemper@unispherenetworks.com> * - reset skb->pkt_type on incoming packets when MAC was changed * - see that changed MAC is saddr for outgoing packets * Oct 20, 2001: Ard van Breeman: * - Fix MC-list, finally. * - Flush MC-list on VLAN destroy. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/slab.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/net_tstamp.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/phy.h> #include <net/arp.h> #include <net/macsec.h> #include <net/netdev_lock.h> #include "vlan.h" #include "vlanproc.h" #include <linux/if_vlan.h> #include <linux/netpoll.h> /* * Create the VLAN header for an arbitrary protocol layer * * saddr=NULL means use device source address * daddr=NULL means leave destination address (eg unresolved arp) * * This is called when the SKB is moving down the stack towards the * physical devices. */ static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct vlan_hdr *vhdr; unsigned int vhdrlen = 0; u16 vlan_tci = 0; int rc; if (!(vlan->flags & VLAN_FLAG_REORDER_HDR)) { vhdr = skb_push(skb, VLAN_HLEN); vlan_tci = vlan->vlan_id; vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb->priority); vhdr->h_vlan_TCI = htons(vlan_tci); /* * Set the protocol type. For a packet of type ETH_P_802_3/2 we * put the length in here instead. */ if (type != ETH_P_802_3 && type != ETH_P_802_2) vhdr->h_vlan_encapsulated_proto = htons(type); else vhdr->h_vlan_encapsulated_proto = htons(len); skb->protocol = vlan->vlan_proto; type = ntohs(vlan->vlan_proto); vhdrlen = VLAN_HLEN; } /* Before delegating work to the lower layer, enter our MAC-address */ if (saddr == NULL) saddr = dev->dev_addr; /* Now make the underlying real hard header */ dev = vlan->real_dev; rc = dev_hard_header(skb, dev, type, daddr, saddr, len + vhdrlen); if (rc > 0) rc += vhdrlen; return rc; } static inline netdev_tx_t vlan_netpoll_send_skb(struct vlan_dev_priv *vlan, struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER return netpoll_send_skb(vlan->netpoll, skb); #else BUG(); return NETDEV_TX_OK; #endif } static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data); unsigned int len; int ret; /* Handle non-VLAN frames if they are sent to us, for example by DHCP. * * NOTE: THIS ASSUMES DIX ETHERNET, SPECIFICALLY NOT SUPPORTING * OTHER THINGS LIKE FDDI/TokenRing/802.3 SNAPs... */ if (vlan->flags & VLAN_FLAG_REORDER_HDR || veth->h_vlan_proto != vlan->vlan_proto) { u16 vlan_tci; vlan_tci = vlan->vlan_id; vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb->priority); __vlan_hwaccel_put_tag(skb, vlan->vlan_proto, vlan_tci); } skb->dev = vlan->real_dev; len = skb->len; if (unlikely(netpoll_tx_running(dev))) return vlan_netpoll_send_skb(vlan, skb); ret = dev_queue_xmit(skb); if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { struct vlan_pcpu_stats *stats; stats = this_cpu_ptr(vlan->vlan_pcpu_stats); u64_stats_update_begin(&stats->syncp); u64_stats_inc(&stats->tx_packets); u64_stats_add(&stats->tx_bytes, len); u64_stats_update_end(&stats->syncp); } else { this_cpu_inc(vlan->vlan_pcpu_stats->tx_dropped); } return ret; } static int vlan_dev_change_mtu(struct net_device *dev, int new_mtu) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; unsigned int max_mtu = real_dev->mtu; if (netif_reduces_vlan_mtu(real_dev)) max_mtu -= VLAN_HLEN; if (max_mtu < new_mtu) return -ERANGE; WRITE_ONCE(dev->mtu, new_mtu); return 0; } void vlan_dev_set_ingress_priority(const struct net_device *dev, u32 skb_prio, u16 vlan_prio) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); if (vlan->ingress_priority_map[vlan_prio & 0x7] && !skb_prio) vlan->nr_ingress_mappings--; else if (!vlan->ingress_priority_map[vlan_prio & 0x7] && skb_prio) vlan->nr_ingress_mappings++; vlan->ingress_priority_map[vlan_prio & 0x7] = skb_prio; } int vlan_dev_set_egress_priority(const struct net_device *dev, u32 skb_prio, u16 vlan_prio) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct vlan_priority_tci_mapping *mp = NULL; struct vlan_priority_tci_mapping *np; u32 vlan_qos = (vlan_prio << VLAN_PRIO_SHIFT) & VLAN_PRIO_MASK; /* See if a priority mapping exists.. */ mp = vlan->egress_priority_map[skb_prio & 0xF]; while (mp) { if (mp->priority == skb_prio) { if (mp->vlan_qos && !vlan_qos) vlan->nr_egress_mappings--; else if (!mp->vlan_qos && vlan_qos) vlan->nr_egress_mappings++; mp->vlan_qos = vlan_qos; return 0; } mp = mp->next; } /* Create a new mapping then. */ mp = vlan->egress_priority_map[skb_prio & 0xF]; np = kmalloc(sizeof(struct vlan_priority_tci_mapping), GFP_KERNEL); if (!np) return -ENOBUFS; np->next = mp; np->priority = skb_prio; np->vlan_qos = vlan_qos; /* Before inserting this element in hash table, make sure all its fields * are committed to memory. * coupled with smp_rmb() in vlan_dev_get_egress_qos_mask() */ smp_wmb(); vlan->egress_priority_map[skb_prio & 0xF] = np; if (vlan_qos) vlan->nr_egress_mappings++; return 0; } /* Flags are defined in the vlan_flags enum in * include/uapi/linux/if_vlan.h file. */ int vlan_dev_change_flags(const struct net_device *dev, u32 flags, u32 mask) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); u32 old_flags = vlan->flags; if (mask & ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP | VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP | VLAN_FLAG_BRIDGE_BINDING)) return -EINVAL; vlan->flags = (old_flags & ~mask) | (flags & mask); if (netif_running(dev) && (vlan->flags ^ old_flags) & VLAN_FLAG_GVRP) { if (vlan->flags & VLAN_FLAG_GVRP) vlan_gvrp_request_join(dev); else vlan_gvrp_request_leave(dev); } if (netif_running(dev) && (vlan->flags ^ old_flags) & VLAN_FLAG_MVRP) { if (vlan->flags & VLAN_FLAG_MVRP) vlan_mvrp_request_join(dev); else vlan_mvrp_request_leave(dev); } return 0; } void vlan_dev_get_realdev_name(const struct net_device *dev, char *result, size_t size) { strscpy_pad(result, vlan_dev_priv(dev)->real_dev->name, size); } bool vlan_dev_inherit_address(struct net_device *dev, struct net_device *real_dev) { if (dev->addr_assign_type != NET_ADDR_STOLEN) return false; eth_hw_addr_set(dev, real_dev->dev_addr); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return true; } static int vlan_dev_open(struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; int err; if (!(real_dev->flags & IFF_UP) && !(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) return -ENETDOWN; if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr) && !vlan_dev_inherit_address(dev, real_dev)) { err = dev_uc_add(real_dev, dev->dev_addr); if (err < 0) goto out; } ether_addr_copy(vlan->real_dev_addr, real_dev->dev_addr); if (vlan->flags & VLAN_FLAG_GVRP) vlan_gvrp_request_join(dev); if (vlan->flags & VLAN_FLAG_MVRP) vlan_mvrp_request_join(dev); if (netif_carrier_ok(real_dev) && !(vlan->flags & VLAN_FLAG_BRIDGE_BINDING)) netif_carrier_on(dev); return 0; out: netif_carrier_off(dev); return err; } static int vlan_dev_stop(struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; dev_mc_unsync(real_dev, dev); dev_uc_unsync(real_dev, dev); if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr)) dev_uc_del(real_dev, dev->dev_addr); if (!(vlan->flags & VLAN_FLAG_BRIDGE_BINDING)) netif_carrier_off(dev); return 0; } static int vlan_dev_set_mac_address(struct net_device *dev, void *p) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; struct sockaddr *addr = p; int err; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; if (!(dev->flags & IFF_UP)) goto out; if (!ether_addr_equal(addr->sa_data, real_dev->dev_addr)) { err = dev_uc_add(real_dev, addr->sa_data); if (err < 0) return err; } if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr)) dev_uc_del(real_dev, dev->dev_addr); out: eth_hw_addr_set(dev, addr->sa_data); return 0; } static int vlan_hwtstamp_get(struct net_device *dev, struct kernel_hwtstamp_config *cfg) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; return generic_hwtstamp_get_lower(real_dev, cfg); } static int vlan_hwtstamp_set(struct net_device *dev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; if (!net_eq(dev_net(dev), dev_net(real_dev))) return -EOPNOTSUPP; return generic_hwtstamp_set_lower(real_dev, cfg, extack); } static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; struct ifreq ifrr; int err = -EOPNOTSUPP; strscpy_pad(ifrr.ifr_name, real_dev->name, IFNAMSIZ); ifrr.ifr_ifru = ifr->ifr_ifru; switch (cmd) { case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSMIIREG: err = dev_eth_ioctl(real_dev, &ifrr, cmd); break; } if (!err) ifr->ifr_ifru = ifrr.ifr_ifru; return err; } static int vlan_dev_neigh_setup(struct net_device *dev, struct neigh_parms *pa) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int err = 0; if (netif_device_present(real_dev) && ops->ndo_neigh_setup) err = ops->ndo_neigh_setup(real_dev, pa); return err; } #if IS_ENABLED(CONFIG_FCOE) static int vlan_dev_fcoe_ddp_setup(struct net_device *dev, u16 xid, struct scatterlist *sgl, unsigned int sgc) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int rc = 0; if (ops->ndo_fcoe_ddp_setup) rc = ops->ndo_fcoe_ddp_setup(real_dev, xid, sgl, sgc); return rc; } static int vlan_dev_fcoe_ddp_done(struct net_device *dev, u16 xid) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int len = 0; if (ops->ndo_fcoe_ddp_done) len = ops->ndo_fcoe_ddp_done(real_dev, xid); return len; } static int vlan_dev_fcoe_enable(struct net_device *dev) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int rc = -EINVAL; if (ops->ndo_fcoe_enable) rc = ops->ndo_fcoe_enable(real_dev); return rc; } static int vlan_dev_fcoe_disable(struct net_device *dev) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int rc = -EINVAL; if (ops->ndo_fcoe_disable) rc = ops->ndo_fcoe_disable(real_dev); return rc; } static int vlan_dev_fcoe_ddp_target(struct net_device *dev, u16 xid, struct scatterlist *sgl, unsigned int sgc) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int rc = 0; if (ops->ndo_fcoe_ddp_target) rc = ops->ndo_fcoe_ddp_target(real_dev, xid, sgl, sgc); return rc; } #endif #ifdef NETDEV_FCOE_WWNN static int vlan_dev_fcoe_get_wwn(struct net_device *dev, u64 *wwn, int type) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int rc = -EINVAL; if (ops->ndo_fcoe_get_wwn) rc = ops->ndo_fcoe_get_wwn(real_dev, wwn, type); return rc; } #endif static void vlan_dev_change_rx_flags(struct net_device *dev, int change) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; if (change & IFF_ALLMULTI) dev_set_allmulti(real_dev, dev->flags & IFF_ALLMULTI ? 1 : -1); if (change & IFF_PROMISC) dev_set_promiscuity(real_dev, dev->flags & IFF_PROMISC ? 1 : -1); } static void vlan_dev_set_rx_mode(struct net_device *vlan_dev) { dev_mc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev); dev_uc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev); } static __be16 vlan_parse_protocol(const struct sk_buff *skb) { struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data); return __vlan_get_protocol(skb, veth->h_vlan_proto, NULL); } static const struct header_ops vlan_header_ops = { .create = vlan_dev_hard_header, .parse = eth_header_parse, .parse_protocol = vlan_parse_protocol, }; static int vlan_passthru_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; if (saddr == NULL) saddr = dev->dev_addr; return dev_hard_header(skb, real_dev, type, daddr, saddr, len); } static const struct header_ops vlan_passthru_header_ops = { .create = vlan_passthru_hard_header, .parse = eth_header_parse, .parse_protocol = vlan_parse_protocol, }; static const struct device_type vlan_type = { .name = "vlan", }; static const struct net_device_ops vlan_netdev_ops; static int vlan_dev_init(struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; netif_carrier_off(dev); /* IFF_BROADCAST|IFF_MULTICAST; ??? */ dev->flags = real_dev->flags & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_MASTER | IFF_SLAVE); dev->state = (real_dev->state & ((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT))) | (1<<__LINK_STATE_PRESENT); if (vlan->flags & VLAN_FLAG_BRIDGE_BINDING) dev->state |= (1 << __LINK_STATE_NOCARRIER); dev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | NETIF_F_HIGHDMA | NETIF_F_SCTP_CRC | NETIF_F_FCOE_CRC | NETIF_F_FSO; if (real_dev->vlan_features & NETIF_F_HW_MACSEC) dev->hw_features |= NETIF_F_HW_MACSEC; dev->features |= dev->hw_features; dev->lltx = true; dev->fcoe_mtu = true; netif_inherit_tso_max(dev, real_dev); if (dev->features & NETIF_F_VLAN_FEATURES) netdev_warn(real_dev, "VLAN features are set incorrectly. Q-in-Q configurations may not work correctly.\n"); dev->vlan_features = real_dev->vlan_features & ~(NETIF_F_FCOE_CRC | NETIF_F_FSO); dev->hw_enc_features = vlan_tnl_features(real_dev); dev->mpls_features = real_dev->mpls_features; /* ipv6 shared card related stuff */ dev->dev_id = real_dev->dev_id; if (is_zero_ether_addr(dev->dev_addr)) { eth_hw_addr_set(dev, real_dev->dev_addr); dev->addr_assign_type = NET_ADDR_STOLEN; } if (is_zero_ether_addr(dev->broadcast)) memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len); #if IS_ENABLED(CONFIG_FCOE) dev->fcoe_ddp_xid = real_dev->fcoe_ddp_xid; #endif dev->needed_headroom = real_dev->needed_headroom; if (vlan_hw_offload_capable(real_dev->features, vlan->vlan_proto)) { dev->header_ops = &vlan_passthru_header_ops; dev->hard_header_len = real_dev->hard_header_len; } else { dev->header_ops = &vlan_header_ops; dev->hard_header_len = real_dev->hard_header_len + VLAN_HLEN; } dev->netdev_ops = &vlan_netdev_ops; SET_NETDEV_DEVTYPE(dev, &vlan_type); netdev_lockdep_set_classes(dev); vlan->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->vlan_pcpu_stats) return -ENOMEM; /* Get vlan's reference to real_dev */ netdev_hold(real_dev, &vlan->dev_tracker, GFP_KERNEL); return 0; } /* Note: this function might be called multiple times for the same device. */ void vlan_dev_free_egress_priority(const struct net_device *dev) { struct vlan_priority_tci_mapping *pm; struct vlan_dev_priv *vlan = vlan_dev_priv(dev); int i; for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) { while ((pm = vlan->egress_priority_map[i]) != NULL) { vlan->egress_priority_map[i] = pm->next; kfree(pm); } } } static void vlan_dev_uninit(struct net_device *dev) { vlan_dev_free_egress_priority(dev); } static netdev_features_t vlan_dev_fix_features(struct net_device *dev, netdev_features_t features) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; netdev_features_t old_features = features; netdev_features_t lower_features; lower_features = netdev_intersect_features((real_dev->vlan_features | NETIF_F_RXCSUM), real_dev->features); /* Add HW_CSUM setting to preserve user ability to control * checksum offload on the vlan device. */ if (lower_features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) lower_features |= NETIF_F_HW_CSUM; features = netdev_intersect_features(features, lower_features); features |= old_features & (NETIF_F_SOFT_FEATURES | NETIF_F_GSO_SOFTWARE); return features; } static int vlan_ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { const struct vlan_dev_priv *vlan = vlan_dev_priv(dev); return __ethtool_get_link_ksettings(vlan->real_dev, cmd); } static void vlan_ethtool_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { strscpy(info->driver, vlan_fullname, sizeof(info->driver)); strscpy(info->version, vlan_version, sizeof(info->version)); strscpy(info->fw_version, "N/A", sizeof(info->fw_version)); } static int vlan_ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info) { const struct vlan_dev_priv *vlan = vlan_dev_priv(dev); return ethtool_get_ts_info_by_layer(vlan->real_dev, info); } static void vlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct vlan_pcpu_stats *p; u32 rx_errors = 0, tx_dropped = 0; int i; for_each_possible_cpu(i) { u64 rxpackets, rxbytes, rxmulticast, txpackets, txbytes; unsigned int start; p = per_cpu_ptr(vlan_dev_priv(dev)->vlan_pcpu_stats, i); do { start = u64_stats_fetch_begin(&p->syncp); rxpackets = u64_stats_read(&p->rx_packets); rxbytes = u64_stats_read(&p->rx_bytes); rxmulticast = u64_stats_read(&p->rx_multicast); txpackets = u64_stats_read(&p->tx_packets); txbytes = u64_stats_read(&p->tx_bytes); } while (u64_stats_fetch_retry(&p->syncp, start)); stats->rx_packets += rxpackets; stats->rx_bytes += rxbytes; stats->multicast += rxmulticast; stats->tx_packets += txpackets; stats->tx_bytes += txbytes; /* rx_errors & tx_dropped are u32 */ rx_errors += READ_ONCE(p->rx_errors); tx_dropped += READ_ONCE(p->tx_dropped); } stats->rx_errors = rx_errors; stats->tx_dropped = tx_dropped; } #ifdef CONFIG_NET_POLL_CONTROLLER static void vlan_dev_poll_controller(struct net_device *dev) { return; } static int vlan_dev_netpoll_setup(struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; struct netpoll *netpoll; int err = 0; netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL); err = -ENOMEM; if (!netpoll) goto out; err = __netpoll_setup(netpoll, real_dev); if (err) { kfree(netpoll); goto out; } vlan->netpoll = netpoll; out: return err; } static void vlan_dev_netpoll_cleanup(struct net_device *dev) { struct vlan_dev_priv *vlan= vlan_dev_priv(dev); struct netpoll *netpoll = vlan->netpoll; if (!netpoll) return; vlan->netpoll = NULL; __netpoll_free(netpoll); } #endif /* CONFIG_NET_POLL_CONTROLLER */ static int vlan_dev_get_iflink(const struct net_device *dev) { const struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; return READ_ONCE(real_dev->ifindex); } static int vlan_dev_fill_forward_path(struct net_device_path_ctx *ctx, struct net_device_path *path) { struct vlan_dev_priv *vlan = vlan_dev_priv(ctx->dev); path->type = DEV_PATH_VLAN; path->encap.id = vlan->vlan_id; path->encap.proto = vlan->vlan_proto; path->dev = ctx->dev; ctx->dev = vlan->real_dev; if (ctx->num_vlans >= ARRAY_SIZE(ctx->vlan)) return -ENOSPC; ctx->vlan[ctx->num_vlans].id = vlan->vlan_id; ctx->vlan[ctx->num_vlans].proto = vlan->vlan_proto; ctx->num_vlans++; return 0; } #if IS_ENABLED(CONFIG_MACSEC) static const struct macsec_ops *vlan_get_macsec_ops(const struct macsec_context *ctx) { return vlan_dev_priv(ctx->netdev)->real_dev->macsec_ops; } static int vlan_macsec_offload(int (* const func)(struct macsec_context *), struct macsec_context *ctx) { if (unlikely(!func)) return 0; return (*func)(ctx); } static int vlan_macsec_dev_open(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_dev_open, ctx); } static int vlan_macsec_dev_stop(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_dev_stop, ctx); } static int vlan_macsec_add_secy(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_add_secy, ctx); } static int vlan_macsec_upd_secy(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_upd_secy, ctx); } static int vlan_macsec_del_secy(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_del_secy, ctx); } static int vlan_macsec_add_rxsc(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_add_rxsc, ctx); } static int vlan_macsec_upd_rxsc(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_upd_rxsc, ctx); } static int vlan_macsec_del_rxsc(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_del_rxsc, ctx); } static int vlan_macsec_add_rxsa(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_add_rxsa, ctx); } static int vlan_macsec_upd_rxsa(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_upd_rxsa, ctx); } static int vlan_macsec_del_rxsa(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_del_rxsa, ctx); } static int vlan_macsec_add_txsa(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_add_txsa, ctx); } static int vlan_macsec_upd_txsa(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_upd_txsa, ctx); } static int vlan_macsec_del_txsa(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_del_txsa, ctx); } static int vlan_macsec_get_dev_stats(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_get_dev_stats, ctx); } static int vlan_macsec_get_tx_sc_stats(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_get_tx_sc_stats, ctx); } static int vlan_macsec_get_tx_sa_stats(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_get_tx_sa_stats, ctx); } static int vlan_macsec_get_rx_sc_stats(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_get_rx_sc_stats, ctx); } static int vlan_macsec_get_rx_sa_stats(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_get_rx_sa_stats, ctx); } static const struct macsec_ops macsec_offload_ops = { /* Device wide */ .mdo_dev_open = vlan_macsec_dev_open, .mdo_dev_stop = vlan_macsec_dev_stop, /* SecY */ .mdo_add_secy = vlan_macsec_add_secy, .mdo_upd_secy = vlan_macsec_upd_secy, .mdo_del_secy = vlan_macsec_del_secy, /* Security channels */ .mdo_add_rxsc = vlan_macsec_add_rxsc, .mdo_upd_rxsc = vlan_macsec_upd_rxsc, .mdo_del_rxsc = vlan_macsec_del_rxsc, /* Security associations */ .mdo_add_rxsa = vlan_macsec_add_rxsa, .mdo_upd_rxsa = vlan_macsec_upd_rxsa, .mdo_del_rxsa = vlan_macsec_del_rxsa, .mdo_add_txsa = vlan_macsec_add_txsa, .mdo_upd_txsa = vlan_macsec_upd_txsa, .mdo_del_txsa = vlan_macsec_del_txsa, /* Statistics */ .mdo_get_dev_stats = vlan_macsec_get_dev_stats, .mdo_get_tx_sc_stats = vlan_macsec_get_tx_sc_stats, .mdo_get_tx_sa_stats = vlan_macsec_get_tx_sa_stats, .mdo_get_rx_sc_stats = vlan_macsec_get_rx_sc_stats, .mdo_get_rx_sa_stats = vlan_macsec_get_rx_sa_stats, }; #endif static const struct ethtool_ops vlan_ethtool_ops = { .get_link_ksettings = vlan_ethtool_get_link_ksettings, .get_drvinfo = vlan_ethtool_get_drvinfo, .get_link = ethtool_op_get_link, .get_ts_info = vlan_ethtool_get_ts_info, }; static const struct net_device_ops vlan_netdev_ops = { .ndo_change_mtu = vlan_dev_change_mtu, .ndo_init = vlan_dev_init, .ndo_uninit = vlan_dev_uninit, .ndo_open = vlan_dev_open, .ndo_stop = vlan_dev_stop, .ndo_start_xmit = vlan_dev_hard_start_xmit, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = vlan_dev_set_mac_address, .ndo_set_rx_mode = vlan_dev_set_rx_mode, .ndo_change_rx_flags = vlan_dev_change_rx_flags, .ndo_eth_ioctl = vlan_dev_ioctl, .ndo_neigh_setup = vlan_dev_neigh_setup, .ndo_get_stats64 = vlan_dev_get_stats64, #if IS_ENABLED(CONFIG_FCOE) .ndo_fcoe_ddp_setup = vlan_dev_fcoe_ddp_setup, .ndo_fcoe_ddp_done = vlan_dev_fcoe_ddp_done, .ndo_fcoe_enable = vlan_dev_fcoe_enable, .ndo_fcoe_disable = vlan_dev_fcoe_disable, .ndo_fcoe_ddp_target = vlan_dev_fcoe_ddp_target, #endif #ifdef NETDEV_FCOE_WWNN .ndo_fcoe_get_wwn = vlan_dev_fcoe_get_wwn, #endif #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = vlan_dev_poll_controller, .ndo_netpoll_setup = vlan_dev_netpoll_setup, .ndo_netpoll_cleanup = vlan_dev_netpoll_cleanup, #endif .ndo_fix_features = vlan_dev_fix_features, .ndo_get_iflink = vlan_dev_get_iflink, .ndo_fill_forward_path = vlan_dev_fill_forward_path, .ndo_hwtstamp_get = vlan_hwtstamp_get, .ndo_hwtstamp_set = vlan_hwtstamp_set, }; static void vlan_dev_free(struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); free_percpu(vlan->vlan_pcpu_stats); vlan->vlan_pcpu_stats = NULL; /* Get rid of the vlan's reference to real_dev */ netdev_put(vlan->real_dev, &vlan->dev_tracker); } void vlan_setup(struct net_device *dev) { ether_setup(dev); dev->priv_flags |= IFF_802_1Q_VLAN | IFF_NO_QUEUE; dev->priv_flags |= IFF_UNICAST_FLT; dev->priv_flags &= ~IFF_TX_SKB_SHARING; netif_keep_dst(dev); dev->netdev_ops = &vlan_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = vlan_dev_free; dev->ethtool_ops = &vlan_ethtool_ops; #if IS_ENABLED(CONFIG_MACSEC) dev->macsec_ops = &macsec_offload_ops; #endif dev->min_mtu = 0; dev->max_mtu = ETH_MAX_MTU; eth_zero_addr(dev->broadcast); }
43 43 43 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 /* SPDX-License-Identifier: GPL-2.0 * * IO cost model based controller. * * Copyright (C) 2019 Tejun Heo <tj@kernel.org> * Copyright (C) 2019 Andy Newell <newella@fb.com> * Copyright (C) 2019 Facebook * * One challenge of controlling IO resources is the lack of trivially * observable cost metric. This is distinguished from CPU and memory where * wallclock time and the number of bytes can serve as accurate enough * approximations. * * Bandwidth and iops are the most commonly used metrics for IO devices but * depending on the type and specifics of the device, different IO patterns * easily lead to multiple orders of magnitude variations rendering them * useless for the purpose of IO capacity distribution. While on-device * time, with a lot of clutches, could serve as a useful approximation for * non-queued rotational devices, this is no longer viable with modern * devices, even the rotational ones. * * While there is no cost metric we can trivially observe, it isn't a * complete mystery. For example, on a rotational device, seek cost * dominates while a contiguous transfer contributes a smaller amount * proportional to the size. If we can characterize at least the relative * costs of these different types of IOs, it should be possible to * implement a reasonable work-conserving proportional IO resource * distribution. * * 1. IO Cost Model * * IO cost model estimates the cost of an IO given its basic parameters and * history (e.g. the end sector of the last IO). The cost is measured in * device time. If a given IO is estimated to cost 10ms, the device should * be able to process ~100 of those IOs in a second. * * Currently, there's only one builtin cost model - linear. Each IO is * classified as sequential or random and given a base cost accordingly. * On top of that, a size cost proportional to the length of the IO is * added. While simple, this model captures the operational * characteristics of a wide varienty of devices well enough. Default * parameters for several different classes of devices are provided and the * parameters can be configured from userspace via * /sys/fs/cgroup/io.cost.model. * * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate * device-specific coefficients. * * 2. Control Strategy * * The device virtual time (vtime) is used as the primary control metric. * The control strategy is composed of the following three parts. * * 2-1. Vtime Distribution * * When a cgroup becomes active in terms of IOs, its hierarchical share is * calculated. Please consider the following hierarchy where the numbers * inside parentheses denote the configured weights. * * root * / \ * A (w:100) B (w:300) * / \ * A0 (w:100) A1 (w:100) * * If B is idle and only A0 and A1 are actively issuing IOs, as the two are * of equal weight, each gets 50% share. If then B starts issuing IOs, B * gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest, * 12.5% each. The distribution mechanism only cares about these flattened * shares. They're called hweights (hierarchical weights) and always add * upto 1 (WEIGHT_ONE). * * A given cgroup's vtime runs slower in inverse proportion to its hweight. * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5) * against the device vtime - an IO which takes 10ms on the underlying * device is considered to take 80ms on A0. * * This constitutes the basis of IO capacity distribution. Each cgroup's * vtime is running at a rate determined by its hweight. A cgroup tracks * the vtime consumed by past IOs and can issue a new IO if doing so * wouldn't outrun the current device vtime. Otherwise, the IO is * suspended until the vtime has progressed enough to cover it. * * 2-2. Vrate Adjustment * * It's unrealistic to expect the cost model to be perfect. There are too * many devices and even on the same device the overall performance * fluctuates depending on numerous factors such as IO mixture and device * internal garbage collection. The controller needs to adapt dynamically. * * This is achieved by adjusting the overall IO rate according to how busy * the device is. If the device becomes overloaded, we're sending down too * many IOs and should generally slow down. If there are waiting issuers * but the device isn't saturated, we're issuing too few and should * generally speed up. * * To slow down, we lower the vrate - the rate at which the device vtime * passes compared to the wall clock. For example, if the vtime is running * at the vrate of 75%, all cgroups added up would only be able to issue * 750ms worth of IOs per second, and vice-versa for speeding up. * * Device business is determined using two criteria - rq wait and * completion latencies. * * When a device gets saturated, the on-device and then the request queues * fill up and a bio which is ready to be issued has to wait for a request * to become available. When this delay becomes noticeable, it's a clear * indication that the device is saturated and we lower the vrate. This * saturation signal is fairly conservative as it only triggers when both * hardware and software queues are filled up, and is used as the default * busy signal. * * As devices can have deep queues and be unfair in how the queued commands * are executed, solely depending on rq wait may not result in satisfactory * control quality. For a better control quality, completion latency QoS * parameters can be configured so that the device is considered saturated * if N'th percentile completion latency rises above the set point. * * The completion latency requirements are a function of both the * underlying device characteristics and the desired IO latency quality of * service. There is an inherent trade-off - the tighter the latency QoS, * the higher the bandwidth lossage. Latency QoS is disabled by default * and can be set through /sys/fs/cgroup/io.cost.qos. * * 2-3. Work Conservation * * Imagine two cgroups A and B with equal weights. A is issuing a small IO * periodically while B is sending out enough parallel IOs to saturate the * device on its own. Let's say A's usage amounts to 100ms worth of IO * cost per second, i.e., 10% of the device capacity. The naive * distribution of half and half would lead to 60% utilization of the * device, a significant reduction in the total amount of work done * compared to free-for-all competition. This is too high a cost to pay * for IO control. * * To conserve the total amount of work done, we keep track of how much * each active cgroup is actually using and yield part of its weight if * there are other cgroups which can make use of it. In the above case, * A's weight will be lowered so that it hovers above the actual usage and * B would be able to use the rest. * * As we don't want to penalize a cgroup for donating its weight, the * surplus weight adjustment factors in a margin and has an immediate * snapback mechanism in case the cgroup needs more IO vtime for itself. * * Note that adjusting down surplus weights has the same effects as * accelerating vtime for other cgroups and work conservation can also be * implemented by adjusting vrate dynamically. However, squaring who can * donate and should take back how much requires hweight propagations * anyway making it easier to implement and understand as a separate * mechanism. * * 3. Monitoring * * Instead of debugfs or other clumsy monitoring mechanisms, this * controller uses a drgn based monitoring script - * tools/cgroup/iocost_monitor.py. For details on drgn, please see * https://github.com/osandov/drgn. The output looks like the following. * * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12% * active weight hweight% inflt% dbt delay usages% * test/a * 50/ 50 33.33/ 33.33 27.65 2 0*041 033:033:033 * test/b * 100/ 100 66.67/ 66.67 17.56 0 0*000 066:079:077 * * - per : Timer period * - cur_per : Internal wall and device vtime clock * - vrate : Device virtual time rate against wall clock * - weight : Surplus-adjusted and configured weights * - hweight : Surplus-adjusted and configured hierarchical weights * - inflt : The percentage of in-flight IO cost at the end of last period * - del_ms : Deferred issuer delay induction level and duration * - usages : Usage history */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/timer.h> #include <linux/time64.h> #include <linux/parser.h> #include <linux/sched/signal.h> #include <asm/local.h> #include <asm/local64.h> #include "blk-rq-qos.h" #include "blk-stat.h" #include "blk-wbt.h" #include "blk-cgroup.h" #ifdef CONFIG_TRACEPOINTS /* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */ #define TRACE_IOCG_PATH_LEN 1024 static DEFINE_SPINLOCK(trace_iocg_path_lock); static char trace_iocg_path[TRACE_IOCG_PATH_LEN]; #define TRACE_IOCG_PATH(type, iocg, ...) \ do { \ unsigned long flags; \ if (trace_iocost_##type##_enabled()) { \ spin_lock_irqsave(&trace_iocg_path_lock, flags); \ cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \ trace_iocg_path, TRACE_IOCG_PATH_LEN); \ trace_iocost_##type(iocg, trace_iocg_path, \ ##__VA_ARGS__); \ spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \ } \ } while (0) #else /* CONFIG_TRACE_POINTS */ #define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0) #endif /* CONFIG_TRACE_POINTS */ enum { MILLION = 1000000, /* timer period is calculated from latency requirements, bound it */ MIN_PERIOD = USEC_PER_MSEC, MAX_PERIOD = USEC_PER_SEC, /* * iocg->vtime is targeted at 50% behind the device vtime, which * serves as its IO credit buffer. Surplus weight adjustment is * immediately canceled if the vtime margin runs below 10%. */ MARGIN_MIN_PCT = 10, MARGIN_LOW_PCT = 20, MARGIN_TARGET_PCT = 50, INUSE_ADJ_STEP_PCT = 25, /* Have some play in timer operations */ TIMER_SLACK_PCT = 1, /* 1/64k is granular enough and can easily be handled w/ u32 */ WEIGHT_ONE = 1 << 16, }; enum { /* * As vtime is used to calculate the cost of each IO, it needs to * be fairly high precision. For example, it should be able to * represent the cost of a single page worth of discard with * suffificient accuracy. At the same time, it should be able to * represent reasonably long enough durations to be useful and * convenient during operation. * * 1s worth of vtime is 2^37. This gives us both sub-nanosecond * granularity and days of wrap-around time even at extreme vrates. */ VTIME_PER_SEC_SHIFT = 37, VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT, VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC, VTIME_PER_NSEC = VTIME_PER_SEC / NSEC_PER_SEC, /* bound vrate adjustments within two orders of magnitude */ VRATE_MIN_PPM = 10000, /* 1% */ VRATE_MAX_PPM = 100000000, /* 10000% */ VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION, VRATE_CLAMP_ADJ_PCT = 4, /* switch iff the conditions are met for longer than this */ AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC, }; enum { /* if IOs end up waiting for requests, issue less */ RQ_WAIT_BUSY_PCT = 5, /* unbusy hysterisis */ UNBUSY_THR_PCT = 75, /* * The effect of delay is indirect and non-linear and a huge amount of * future debt can accumulate abruptly while unthrottled. Linearly scale * up delay as debt is going up and then let it decay exponentially. * This gives us quick ramp ups while delay is accumulating and long * tails which can help reducing the frequency of debt explosions on * unthrottle. The parameters are experimentally determined. * * The delay mechanism provides adequate protection and behavior in many * cases. However, this is far from ideal and falls shorts on both * fronts. The debtors are often throttled too harshly costing a * significant level of fairness and possibly total work while the * protection against their impacts on the system can be choppy and * unreliable. * * The shortcoming primarily stems from the fact that, unlike for page * cache, the kernel doesn't have well-defined back-pressure propagation * mechanism and policies for anonymous memory. Fully addressing this * issue will likely require substantial improvements in the area. */ MIN_DELAY_THR_PCT = 500, MAX_DELAY_THR_PCT = 25000, MIN_DELAY = 250, MAX_DELAY = 250 * USEC_PER_MSEC, /* halve debts if avg usage over 100ms is under 50% */ DFGV_USAGE_PCT = 50, DFGV_PERIOD = 100 * USEC_PER_MSEC, /* don't let cmds which take a very long time pin lagging for too long */ MAX_LAGGING_PERIODS = 10, /* * Count IO size in 4k pages. The 12bit shift helps keeping * size-proportional components of cost calculation in closer * numbers of digits to per-IO cost components. */ IOC_PAGE_SHIFT = 12, IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT, IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT, /* if apart further than 16M, consider randio for linear model */ LCOEF_RANDIO_PAGES = 4096, }; enum ioc_running { IOC_IDLE, IOC_RUNNING, IOC_STOP, }; /* io.cost.qos controls including per-dev enable of the whole controller */ enum { QOS_ENABLE, QOS_CTRL, NR_QOS_CTRL_PARAMS, }; /* io.cost.qos params */ enum { QOS_RPPM, QOS_RLAT, QOS_WPPM, QOS_WLAT, QOS_MIN, QOS_MAX, NR_QOS_PARAMS, }; /* io.cost.model controls */ enum { COST_CTRL, COST_MODEL, NR_COST_CTRL_PARAMS, }; /* builtin linear cost model coefficients */ enum { I_LCOEF_RBPS, I_LCOEF_RSEQIOPS, I_LCOEF_RRANDIOPS, I_LCOEF_WBPS, I_LCOEF_WSEQIOPS, I_LCOEF_WRANDIOPS, NR_I_LCOEFS, }; enum { LCOEF_RPAGE, LCOEF_RSEQIO, LCOEF_RRANDIO, LCOEF_WPAGE, LCOEF_WSEQIO, LCOEF_WRANDIO, NR_LCOEFS, }; enum { AUTOP_INVALID, AUTOP_HDD, AUTOP_SSD_QD1, AUTOP_SSD_DFL, AUTOP_SSD_FAST, }; struct ioc_params { u32 qos[NR_QOS_PARAMS]; u64 i_lcoefs[NR_I_LCOEFS]; u64 lcoefs[NR_LCOEFS]; u32 too_fast_vrate_pct; u32 too_slow_vrate_pct; }; struct ioc_margins { s64 min; s64 low; s64 target; }; struct ioc_missed { local_t nr_met; local_t nr_missed; u32 last_met; u32 last_missed; }; struct ioc_pcpu_stat { struct ioc_missed missed[2]; local64_t rq_wait_ns; u64 last_rq_wait_ns; }; /* per device */ struct ioc { struct rq_qos rqos; bool enabled; struct ioc_params params; struct ioc_margins margins; u32 period_us; u32 timer_slack_ns; u64 vrate_min; u64 vrate_max; spinlock_t lock; struct timer_list timer; struct list_head active_iocgs; /* active cgroups */ struct ioc_pcpu_stat __percpu *pcpu_stat; enum ioc_running running; atomic64_t vtime_rate; u64 vtime_base_rate; s64 vtime_err; seqcount_spinlock_t period_seqcount; u64 period_at; /* wallclock starttime */ u64 period_at_vtime; /* vtime starttime */ atomic64_t cur_period; /* inc'd each period */ int busy_level; /* saturation history */ bool weights_updated; atomic_t hweight_gen; /* for lazy hweights */ /* debt forgivness */ u64 dfgv_period_at; u64 dfgv_period_rem; u64 dfgv_usage_us_sum; u64 autop_too_fast_at; u64 autop_too_slow_at; int autop_idx; bool user_qos_params:1; bool user_cost_model:1; }; struct iocg_pcpu_stat { local64_t abs_vusage; }; struct iocg_stat { u64 usage_us; u64 wait_us; u64 indebt_us; u64 indelay_us; }; /* per device-cgroup pair */ struct ioc_gq { struct blkg_policy_data pd; struct ioc *ioc; /* * A iocg can get its weight from two sources - an explicit * per-device-cgroup configuration or the default weight of the * cgroup. `cfg_weight` is the explicit per-device-cgroup * configuration. `weight` is the effective considering both * sources. * * When an idle cgroup becomes active its `active` goes from 0 to * `weight`. `inuse` is the surplus adjusted active weight. * `active` and `inuse` are used to calculate `hweight_active` and * `hweight_inuse`. * * `last_inuse` remembers `inuse` while an iocg is idle to persist * surplus adjustments. * * `inuse` may be adjusted dynamically during period. `saved_*` are used * to determine and track adjustments. */ u32 cfg_weight; u32 weight; u32 active; u32 inuse; u32 last_inuse; s64 saved_margin; sector_t cursor; /* to detect randio */ /* * `vtime` is this iocg's vtime cursor which progresses as IOs are * issued. If lagging behind device vtime, the delta represents * the currently available IO budget. If running ahead, the * overage. * * `vtime_done` is the same but progressed on completion rather * than issue. The delta behind `vtime` represents the cost of * currently in-flight IOs. */ atomic64_t vtime; atomic64_t done_vtime; u64 abs_vdebt; /* current delay in effect and when it started */ u64 delay; u64 delay_at; /* * The period this iocg was last active in. Used for deactivation * and invalidating `vtime`. */ atomic64_t active_period; struct list_head active_list; /* see __propagate_weights() and current_hweight() for details */ u64 child_active_sum; u64 child_inuse_sum; u64 child_adjusted_sum; int hweight_gen; u32 hweight_active; u32 hweight_inuse; u32 hweight_donating; u32 hweight_after_donation; struct list_head walk_list; struct list_head surplus_list; struct wait_queue_head waitq; struct hrtimer waitq_timer; /* timestamp at the latest activation */ u64 activated_at; /* statistics */ struct iocg_pcpu_stat __percpu *pcpu_stat; struct iocg_stat stat; struct iocg_stat last_stat; u64 last_stat_abs_vusage; u64 usage_delta_us; u64 wait_since; u64 indebt_since; u64 indelay_since; /* this iocg's depth in the hierarchy and ancestors including self */ int level; struct ioc_gq *ancestors[]; }; /* per cgroup */ struct ioc_cgrp { struct blkcg_policy_data cpd; unsigned int dfl_weight; }; struct ioc_now { u64 now_ns; u64 now; u64 vnow; }; struct iocg_wait { struct wait_queue_entry wait; struct bio *bio; u64 abs_cost; bool committed; }; struct iocg_wake_ctx { struct ioc_gq *iocg; u32 hw_inuse; s64 vbudget; }; static const struct ioc_params autop[] = { [AUTOP_HDD] = { .qos = { [QOS_RLAT] = 250000, /* 250ms */ [QOS_WLAT] = 250000, [QOS_MIN] = VRATE_MIN_PPM, [QOS_MAX] = VRATE_MAX_PPM, }, .i_lcoefs = { [I_LCOEF_RBPS] = 174019176, [I_LCOEF_RSEQIOPS] = 41708, [I_LCOEF_RRANDIOPS] = 370, [I_LCOEF_WBPS] = 178075866, [I_LCOEF_WSEQIOPS] = 42705, [I_LCOEF_WRANDIOPS] = 378, }, }, [AUTOP_SSD_QD1] = { .qos = { [QOS_RLAT] = 25000, /* 25ms */ [QOS_WLAT] = 25000, [QOS_MIN] = VRATE_MIN_PPM, [QOS_MAX] = VRATE_MAX_PPM, }, .i_lcoefs = { [I_LCOEF_RBPS] = 245855193, [I_LCOEF_RSEQIOPS] = 61575, [I_LCOEF_RRANDIOPS] = 6946, [I_LCOEF_WBPS] = 141365009, [I_LCOEF_WSEQIOPS] = 33716, [I_LCOEF_WRANDIOPS] = 26796, }, }, [AUTOP_SSD_DFL] = { .qos = { [QOS_RLAT] = 25000, /* 25ms */ [QOS_WLAT] = 25000, [QOS_MIN] = VRATE_MIN_PPM, [QOS_MAX] = VRATE_MAX_PPM, }, .i_lcoefs = { [I_LCOEF_RBPS] = 488636629, [I_LCOEF_RSEQIOPS] = 8932, [I_LCOEF_RRANDIOPS] = 8518, [I_LCOEF_WBPS] = 427891549, [I_LCOEF_WSEQIOPS] = 28755, [I_LCOEF_WRANDIOPS] = 21940, }, .too_fast_vrate_pct = 500, }, [AUTOP_SSD_FAST] = { .qos = { [QOS_RLAT] = 5000, /* 5ms */ [QOS_WLAT] = 5000, [QOS_MIN] = VRATE_MIN_PPM, [QOS_MAX] = VRATE_MAX_PPM, }, .i_lcoefs = { [I_LCOEF_RBPS] = 3102524156LLU, [I_LCOEF_RSEQIOPS] = 724816, [I_LCOEF_RRANDIOPS] = 778122, [I_LCOEF_WBPS] = 1742780862LLU, [I_LCOEF_WSEQIOPS] = 425702, [I_LCOEF_WRANDIOPS] = 443193, }, .too_slow_vrate_pct = 10, }, }; /* * vrate adjust percentages indexed by ioc->busy_level. We adjust up on * vtime credit shortage and down on device saturation. */ static const u32 vrate_adj_pct[] = { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 }; static struct blkcg_policy blkcg_policy_iocost; /* accessors and helpers */ static struct ioc *rqos_to_ioc(struct rq_qos *rqos) { return container_of(rqos, struct ioc, rqos); } static struct ioc *q_to_ioc(struct request_queue *q) { return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST)); } static const char __maybe_unused *ioc_name(struct ioc *ioc) { struct gendisk *disk = ioc->rqos.disk; if (!disk) return "<unknown>"; return disk->disk_name; } static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd) { return pd ? container_of(pd, struct ioc_gq, pd) : NULL; } static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg) { return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost)); } static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg) { return pd_to_blkg(&iocg->pd); } static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg) { return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost), struct ioc_cgrp, cpd); } /* * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical * weight, the more expensive each IO. Must round up. */ static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse) { return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse); } /* * The inverse of abs_cost_to_cost(). Must round up. */ static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse) { return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE); } static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 abs_cost, u64 cost) { struct iocg_pcpu_stat *gcs; bio->bi_iocost_cost = cost; atomic64_add(cost, &iocg->vtime); gcs = get_cpu_ptr(iocg->pcpu_stat); local64_add(abs_cost, &gcs->abs_vusage); put_cpu_ptr(gcs); } static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags) { if (lock_ioc) { spin_lock_irqsave(&iocg->ioc->lock, *flags); spin_lock(&iocg->waitq.lock); } else { spin_lock_irqsave(&iocg->waitq.lock, *flags); } } static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags) { if (unlock_ioc) { spin_unlock(&iocg->waitq.lock); spin_unlock_irqrestore(&iocg->ioc->lock, *flags); } else { spin_unlock_irqrestore(&iocg->waitq.lock, *flags); } } #define CREATE_TRACE_POINTS #include <trace/events/iocost.h> static void ioc_refresh_margins(struct ioc *ioc) { struct ioc_margins *margins = &ioc->margins; u32 period_us = ioc->period_us; u64 vrate = ioc->vtime_base_rate; margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate; margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate; margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate; } /* latency Qos params changed, update period_us and all the dependent params */ static void ioc_refresh_period_us(struct ioc *ioc) { u32 ppm, lat, multi, period_us; lockdep_assert_held(&ioc->lock); /* pick the higher latency target */ if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) { ppm = ioc->params.qos[QOS_RPPM]; lat = ioc->params.qos[QOS_RLAT]; } else { ppm = ioc->params.qos[QOS_WPPM]; lat = ioc->params.qos[QOS_WLAT]; } /* * We want the period to be long enough to contain a healthy number * of IOs while short enough for granular control. Define it as a * multiple of the latency target. Ideally, the multiplier should * be scaled according to the percentile so that it would nominally * contain a certain number of requests. Let's be simpler and * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50). */ if (ppm) multi = max_t(u32, (MILLION - ppm) / 50000, 2); else multi = 2; period_us = multi * lat; period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD); /* calculate dependent params */ ioc->period_us = period_us; ioc->timer_slack_ns = div64_u64( (u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT, 100); ioc_refresh_margins(ioc); } /* * ioc->rqos.disk isn't initialized when this function is called from * the init path. */ static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk) { int idx = ioc->autop_idx; const struct ioc_params *p = &autop[idx]; u32 vrate_pct; u64 now_ns; /* rotational? */ if (!blk_queue_nonrot(disk->queue)) return AUTOP_HDD; /* handle SATA SSDs w/ broken NCQ */ if (blk_queue_depth(disk->queue) == 1) return AUTOP_SSD_QD1; /* use one of the normal ssd sets */ if (idx < AUTOP_SSD_DFL) return AUTOP_SSD_DFL; /* if user is overriding anything, maintain what was there */ if (ioc->user_qos_params || ioc->user_cost_model) return idx; /* step up/down based on the vrate */ vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC); now_ns = blk_time_get_ns(); if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) { if (!ioc->autop_too_fast_at) ioc->autop_too_fast_at = now_ns; if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC) return idx + 1; } else { ioc->autop_too_fast_at = 0; } if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) { if (!ioc->autop_too_slow_at) ioc->autop_too_slow_at = now_ns; if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC) return idx - 1; } else { ioc->autop_too_slow_at = 0; } return idx; } /* * Take the followings as input * * @bps maximum sequential throughput * @seqiops maximum sequential 4k iops * @randiops maximum random 4k iops * * and calculate the linear model cost coefficients. * * *@page per-page cost 1s / (@bps / 4096) * *@seqio base cost of a seq IO max((1s / @seqiops) - *@page, 0) * @randiops base cost of a rand IO max((1s / @randiops) - *@page, 0) */ static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops, u64 *page, u64 *seqio, u64 *randio) { u64 v; *page = *seqio = *randio = 0; if (bps) { u64 bps_pages = DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE); if (bps_pages) *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, bps_pages); else *page = 1; } if (seqiops) { v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops); if (v > *page) *seqio = v - *page; } if (randiops) { v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops); if (v > *page) *randio = v - *page; } } static void ioc_refresh_lcoefs(struct ioc *ioc) { u64 *u = ioc->params.i_lcoefs; u64 *c = ioc->params.lcoefs; calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS], &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]); calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS], &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]); } /* * struct gendisk is required as an argument because ioc->rqos.disk * is not properly initialized when called from the init path. */ static bool ioc_refresh_params_disk(struct ioc *ioc, bool force, struct gendisk *disk) { const struct ioc_params *p; int idx; lockdep_assert_held(&ioc->lock); idx = ioc_autop_idx(ioc, disk); p = &autop[idx]; if (idx == ioc->autop_idx && !force) return false; if (idx != ioc->autop_idx) { atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); ioc->vtime_base_rate = VTIME_PER_USEC; } ioc->autop_idx = idx; ioc->autop_too_fast_at = 0; ioc->autop_too_slow_at = 0; if (!ioc->user_qos_params) memcpy(ioc->params.qos, p->qos, sizeof(p->qos)); if (!ioc->user_cost_model) memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs)); ioc_refresh_period_us(ioc); ioc_refresh_lcoefs(ioc); ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] * VTIME_PER_USEC, MILLION); ioc->vrate_max = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MAX] * VTIME_PER_USEC, MILLION); return true; } static bool ioc_refresh_params(struct ioc *ioc, bool force) { return ioc_refresh_params_disk(ioc, force, ioc->rqos.disk); } /* * When an iocg accumulates too much vtime or gets deactivated, we throw away * some vtime, which lowers the overall device utilization. As the exact amount * which is being thrown away is known, we can compensate by accelerating the * vrate accordingly so that the extra vtime generated in the current period * matches what got lost. */ static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now) { s64 pleft = ioc->period_at + ioc->period_us - now->now; s64 vperiod = ioc->period_us * ioc->vtime_base_rate; s64 vcomp, vcomp_min, vcomp_max; lockdep_assert_held(&ioc->lock); /* we need some time left in this period */ if (pleft <= 0) goto done; /* * Calculate how much vrate should be adjusted to offset the error. * Limit the amount of adjustment and deduct the adjusted amount from * the error. */ vcomp = -div64_s64(ioc->vtime_err, pleft); vcomp_min = -(ioc->vtime_base_rate >> 1); vcomp_max = ioc->vtime_base_rate; vcomp = clamp(vcomp, vcomp_min, vcomp_max); ioc->vtime_err += vcomp * pleft; atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp); done: /* bound how much error can accumulate */ ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod); } static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct, int nr_lagging, int nr_shortages, int prev_busy_level, u32 *missed_ppm) { u64 vrate = ioc->vtime_base_rate; u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) { if (ioc->busy_level != prev_busy_level || nr_lagging) trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages); return; } /* * If vrate is out of bounds, apply clamp gradually as the * bounds can change abruptly. Otherwise, apply busy_level * based adjustment. */ if (vrate < vrate_min) { vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100); vrate = min(vrate, vrate_min); } else if (vrate > vrate_max) { vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100); vrate = max(vrate, vrate_max); } else { int idx = min_t(int, abs(ioc->busy_level), ARRAY_SIZE(vrate_adj_pct) - 1); u32 adj_pct = vrate_adj_pct[idx]; if (ioc->busy_level > 0) adj_pct = 100 - adj_pct; else adj_pct = 100 + adj_pct; vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100), vrate_min, vrate_max); } trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages); ioc->vtime_base_rate = vrate; ioc_refresh_margins(ioc); } /* take a snapshot of the current [v]time and vrate */ static void ioc_now(struct ioc *ioc, struct ioc_now *now) { unsigned seq; u64 vrate; now->now_ns = blk_time_get_ns(); now->now = ktime_to_us(now->now_ns); vrate = atomic64_read(&ioc->vtime_rate); /* * The current vtime is * * vtime at period start + (wallclock time since the start) * vrate * * As a consistent snapshot of `period_at_vtime` and `period_at` is * needed, they're seqcount protected. */ do { seq = read_seqcount_begin(&ioc->period_seqcount); now->vnow = ioc->period_at_vtime + (now->now - ioc->period_at) * vrate; } while (read_seqcount_retry(&ioc->period_seqcount, seq)); } static void ioc_start_period(struct ioc *ioc, struct ioc_now *now) { WARN_ON_ONCE(ioc->running != IOC_RUNNING); write_seqcount_begin(&ioc->period_seqcount); ioc->period_at = now->now; ioc->period_at_vtime = now->vnow; write_seqcount_end(&ioc->period_seqcount); ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us); add_timer(&ioc->timer); } /* * Update @iocg's `active` and `inuse` to @active and @inuse, update level * weight sums and propagate upwards accordingly. If @save, the current margin * is saved to be used as reference for later inuse in-period adjustments. */ static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, bool save, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; int lvl; lockdep_assert_held(&ioc->lock); /* * For an active leaf node, its inuse shouldn't be zero or exceed * @active. An active internal node's inuse is solely determined by the * inuse to active ratio of its children regardless of @inuse. */ if (list_empty(&iocg->active_list) && iocg->child_active_sum) { inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum, iocg->child_active_sum); } else { /* * It may be tempting to turn this into a clamp expression with * a lower limit of 1 but active may be 0, which cannot be used * as an upper limit in that situation. This expression allows * active to clamp inuse unless it is 0, in which case inuse * becomes 1. */ inuse = min(inuse, active) ?: 1; } iocg->last_inuse = iocg->inuse; if (save) iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime); if (active == iocg->active && inuse == iocg->inuse) return; for (lvl = iocg->level - 1; lvl >= 0; lvl--) { struct ioc_gq *parent = iocg->ancestors[lvl]; struct ioc_gq *child = iocg->ancestors[lvl + 1]; u32 parent_active = 0, parent_inuse = 0; /* update the level sums */ parent->child_active_sum += (s32)(active - child->active); parent->child_inuse_sum += (s32)(inuse - child->inuse); /* apply the updates */ child->active = active; child->inuse = inuse; /* * The delta between inuse and active sums indicates that * much of weight is being given away. Parent's inuse * and active should reflect the ratio. */ if (parent->child_active_sum) { parent_active = parent->weight; parent_inuse = DIV64_U64_ROUND_UP( parent_active * parent->child_inuse_sum, parent->child_active_sum); } /* do we need to keep walking up? */ if (parent_active == parent->active && parent_inuse == parent->inuse) break; active = parent_active; inuse = parent_inuse; } ioc->weights_updated = true; } static void commit_weights(struct ioc *ioc) { lockdep_assert_held(&ioc->lock); if (ioc->weights_updated) { /* paired with rmb in current_hweight(), see there */ smp_wmb(); atomic_inc(&ioc->hweight_gen); ioc->weights_updated = false; } } static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, bool save, struct ioc_now *now) { __propagate_weights(iocg, active, inuse, save, now); commit_weights(iocg->ioc); } static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep) { struct ioc *ioc = iocg->ioc; int lvl; u32 hwa, hwi; int ioc_gen; /* hot path - if uptodate, use cached */ ioc_gen = atomic_read(&ioc->hweight_gen); if (ioc_gen == iocg->hweight_gen) goto out; /* * Paired with wmb in commit_weights(). If we saw the updated * hweight_gen, all the weight updates from __propagate_weights() are * visible too. * * We can race with weight updates during calculation and get it * wrong. However, hweight_gen would have changed and a future * reader will recalculate and we're guaranteed to discard the * wrong result soon. */ smp_rmb(); hwa = hwi = WEIGHT_ONE; for (lvl = 0; lvl <= iocg->level - 1; lvl++) { struct ioc_gq *parent = iocg->ancestors[lvl]; struct ioc_gq *child = iocg->ancestors[lvl + 1]; u64 active_sum = READ_ONCE(parent->child_active_sum); u64 inuse_sum = READ_ONCE(parent->child_inuse_sum); u32 active = READ_ONCE(child->active); u32 inuse = READ_ONCE(child->inuse); /* we can race with deactivations and either may read as zero */ if (!active_sum || !inuse_sum) continue; active_sum = max_t(u64, active, active_sum); hwa = div64_u64((u64)hwa * active, active_sum); inuse_sum = max_t(u64, inuse, inuse_sum); hwi = div64_u64((u64)hwi * inuse, inuse_sum); } iocg->hweight_active = max_t(u32, hwa, 1); iocg->hweight_inuse = max_t(u32, hwi, 1); iocg->hweight_gen = ioc_gen; out: if (hw_activep) *hw_activep = iocg->hweight_active; if (hw_inusep) *hw_inusep = iocg->hweight_inuse; } /* * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the * other weights stay unchanged. */ static u32 current_hweight_max(struct ioc_gq *iocg) { u32 hwm = WEIGHT_ONE; u32 inuse = iocg->active; u64 child_inuse_sum; int lvl; lockdep_assert_held(&iocg->ioc->lock); for (lvl = iocg->level - 1; lvl >= 0; lvl--) { struct ioc_gq *parent = iocg->ancestors[lvl]; struct ioc_gq *child = iocg->ancestors[lvl + 1]; child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse; hwm = div64_u64((u64)hwm * inuse, child_inuse_sum); inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum, parent->child_active_sum); } return max_t(u32, hwm, 1); } static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct blkcg_gq *blkg = iocg_to_blkg(iocg); struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg); u32 weight; lockdep_assert_held(&ioc->lock); weight = iocg->cfg_weight ?: iocc->dfl_weight; if (weight != iocg->weight && iocg->active) propagate_weights(iocg, weight, iocg->inuse, true, now); iocg->weight = weight; } static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; u64 __maybe_unused last_period, cur_period; u64 vtime, vtarget; int i; /* * If seem to be already active, just update the stamp to tell the * timer that we're still active. We don't mind occassional races. */ if (!list_empty(&iocg->active_list)) { ioc_now(ioc, now); cur_period = atomic64_read(&ioc->cur_period); if (atomic64_read(&iocg->active_period) != cur_period) atomic64_set(&iocg->active_period, cur_period); return true; } /* racy check on internal node IOs, treat as root level IOs */ if (iocg->child_active_sum) return false; spin_lock_irq(&ioc->lock); ioc_now(ioc, now); /* update period */ cur_period = atomic64_read(&ioc->cur_period); last_period = atomic64_read(&iocg->active_period); atomic64_set(&iocg->active_period, cur_period); /* already activated or breaking leaf-only constraint? */ if (!list_empty(&iocg->active_list)) goto succeed_unlock; for (i = iocg->level - 1; i > 0; i--) if (!list_empty(&iocg->ancestors[i]->active_list)) goto fail_unlock; if (iocg->child_active_sum) goto fail_unlock; /* * Always start with the target budget. On deactivation, we throw away * anything above it. */ vtarget = now->vnow - ioc->margins.target; vtime = atomic64_read(&iocg->vtime); atomic64_add(vtarget - vtime, &iocg->vtime); atomic64_add(vtarget - vtime, &iocg->done_vtime); vtime = vtarget; /* * Activate, propagate weight and start period timer if not * running. Reset hweight_gen to avoid accidental match from * wrapping. */ iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1; list_add(&iocg->active_list, &ioc->active_iocgs); propagate_weights(iocg, iocg->weight, iocg->last_inuse ?: iocg->weight, true, now); TRACE_IOCG_PATH(iocg_activate, iocg, now, last_period, cur_period, vtime); iocg->activated_at = now->now; if (ioc->running == IOC_IDLE) { ioc->running = IOC_RUNNING; ioc->dfgv_period_at = now->now; ioc->dfgv_period_rem = 0; ioc_start_period(ioc, now); } succeed_unlock: spin_unlock_irq(&ioc->lock); return true; fail_unlock: spin_unlock_irq(&ioc->lock); return false; } static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct blkcg_gq *blkg = iocg_to_blkg(iocg); u64 tdelta, delay, new_delay, shift; s64 vover, vover_pct; u32 hwa; lockdep_assert_held(&iocg->waitq.lock); /* * If the delay is set by another CPU, we may be in the past. No need to * change anything if so. This avoids decay calculation underflow. */ if (time_before64(now->now, iocg->delay_at)) return false; /* calculate the current delay in effect - 1/2 every second */ tdelta = now->now - iocg->delay_at; shift = div64_u64(tdelta, USEC_PER_SEC); if (iocg->delay && shift < BITS_PER_LONG) delay = iocg->delay >> shift; else delay = 0; /* calculate the new delay from the debt amount */ current_hweight(iocg, &hwa, NULL); vover = atomic64_read(&iocg->vtime) + abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow; vover_pct = div64_s64(100 * vover, ioc->period_us * ioc->vtime_base_rate); if (vover_pct <= MIN_DELAY_THR_PCT) new_delay = 0; else if (vover_pct >= MAX_DELAY_THR_PCT) new_delay = MAX_DELAY; else new_delay = MIN_DELAY + div_u64((MAX_DELAY - MIN_DELAY) * (vover_pct - MIN_DELAY_THR_PCT), MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT); /* pick the higher one and apply */ if (new_delay > delay) { iocg->delay = new_delay; iocg->delay_at = now->now; delay = new_delay; } if (delay >= MIN_DELAY) { if (!iocg->indelay_since) iocg->indelay_since = now->now; blkcg_set_delay(blkg, delay * NSEC_PER_USEC); return true; } else { if (iocg->indelay_since) { iocg->stat.indelay_us += now->now - iocg->indelay_since; iocg->indelay_since = 0; } iocg->delay = 0; blkcg_clear_delay(blkg); return false; } } static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost, struct ioc_now *now) { struct iocg_pcpu_stat *gcs; lockdep_assert_held(&iocg->ioc->lock); lockdep_assert_held(&iocg->waitq.lock); WARN_ON_ONCE(list_empty(&iocg->active_list)); /* * Once in debt, debt handling owns inuse. @iocg stays at the minimum * inuse donating all of it share to others until its debt is paid off. */ if (!iocg->abs_vdebt && abs_cost) { iocg->indebt_since = now->now; propagate_weights(iocg, iocg->active, 0, false, now); } iocg->abs_vdebt += abs_cost; gcs = get_cpu_ptr(iocg->pcpu_stat); local64_add(abs_cost, &gcs->abs_vusage); put_cpu_ptr(gcs); } static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay, struct ioc_now *now) { lockdep_assert_held(&iocg->ioc->lock); lockdep_assert_held(&iocg->waitq.lock); /* * make sure that nobody messed with @iocg. Check iocg->pd.online * to avoid warn when removing blkcg or disk. */ WARN_ON_ONCE(list_empty(&iocg->active_list) && iocg->pd.online); WARN_ON_ONCE(iocg->inuse > 1); iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt); /* if debt is paid in full, restore inuse */ if (!iocg->abs_vdebt) { iocg->stat.indebt_us += now->now - iocg->indebt_since; iocg->indebt_since = 0; propagate_weights(iocg, iocg->active, iocg->last_inuse, false, now); } } static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key) { struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait); struct iocg_wake_ctx *ctx = key; u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse); ctx->vbudget -= cost; if (ctx->vbudget < 0) return -1; iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost); wait->committed = true; /* * autoremove_wake_function() removes the wait entry only when it * actually changed the task state. We want the wait always removed. * Remove explicitly and use default_wake_function(). Note that the * order of operations is important as finish_wait() tests whether * @wq_entry is removed without grabbing the lock. */ default_wake_function(wq_entry, mode, flags, key); list_del_init_careful(&wq_entry->entry); return 0; } /* * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in * addition to iocg->waitq.lock. */ static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct iocg_wake_ctx ctx = { .iocg = iocg }; u64 vshortage, expires, oexpires; s64 vbudget; u32 hwa; lockdep_assert_held(&iocg->waitq.lock); current_hweight(iocg, &hwa, NULL); vbudget = now->vnow - atomic64_read(&iocg->vtime); /* pay off debt */ if (pay_debt && iocg->abs_vdebt && vbudget > 0) { u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa); u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt); u64 vpay = abs_cost_to_cost(abs_vpay, hwa); lockdep_assert_held(&ioc->lock); atomic64_add(vpay, &iocg->vtime); atomic64_add(vpay, &iocg->done_vtime); iocg_pay_debt(iocg, abs_vpay, now); vbudget -= vpay; } if (iocg->abs_vdebt || iocg->delay) iocg_kick_delay(iocg, now); /* * Debt can still be outstanding if we haven't paid all yet or the * caller raced and called without @pay_debt. Shouldn't wake up waiters * under debt. Make sure @vbudget reflects the outstanding amount and is * not positive. */ if (iocg->abs_vdebt) { s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa); vbudget = min_t(s64, 0, vbudget - vdebt); } /* * Wake up the ones which are due and see how much vtime we'll need for * the next one. As paying off debt restores hw_inuse, it must be read * after the above debt payment. */ ctx.vbudget = vbudget; current_hweight(iocg, NULL, &ctx.hw_inuse); __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx); if (!waitqueue_active(&iocg->waitq)) { if (iocg->wait_since) { iocg->stat.wait_us += now->now - iocg->wait_since; iocg->wait_since = 0; } return; } if (!iocg->wait_since) iocg->wait_since = now->now; if (WARN_ON_ONCE(ctx.vbudget >= 0)) return; /* determine next wakeup, add a timer margin to guarantee chunking */ vshortage = -ctx.vbudget; expires = now->now_ns + DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) * NSEC_PER_USEC; expires += ioc->timer_slack_ns; /* if already active and close enough, don't bother */ oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer)); if (hrtimer_is_queued(&iocg->waitq_timer) && abs(oexpires - expires) <= ioc->timer_slack_ns) return; hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires), ioc->timer_slack_ns, HRTIMER_MODE_ABS); } static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer) { struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer); bool pay_debt = READ_ONCE(iocg->abs_vdebt); struct ioc_now now; unsigned long flags; ioc_now(iocg->ioc, &now); iocg_lock(iocg, pay_debt, &flags); iocg_kick_waitq(iocg, pay_debt, &now); iocg_unlock(iocg, pay_debt, &flags); return HRTIMER_NORESTART; } static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p) { u32 nr_met[2] = { }; u32 nr_missed[2] = { }; u64 rq_wait_ns = 0; int cpu, rw; for_each_online_cpu(cpu) { struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu); u64 this_rq_wait_ns; for (rw = READ; rw <= WRITE; rw++) { u32 this_met = local_read(&stat->missed[rw].nr_met); u32 this_missed = local_read(&stat->missed[rw].nr_missed); nr_met[rw] += this_met - stat->missed[rw].last_met; nr_missed[rw] += this_missed - stat->missed[rw].last_missed; stat->missed[rw].last_met = this_met; stat->missed[rw].last_missed = this_missed; } this_rq_wait_ns = local64_read(&stat->rq_wait_ns); rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns; stat->last_rq_wait_ns = this_rq_wait_ns; } for (rw = READ; rw <= WRITE; rw++) { if (nr_met[rw] + nr_missed[rw]) missed_ppm_ar[rw] = DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION, nr_met[rw] + nr_missed[rw]); else missed_ppm_ar[rw] = 0; } *rq_wait_pct_p = div64_u64(rq_wait_ns * 100, ioc->period_us * NSEC_PER_USEC); } /* was iocg idle this period? */ static bool iocg_is_idle(struct ioc_gq *iocg) { struct ioc *ioc = iocg->ioc; /* did something get issued this period? */ if (atomic64_read(&iocg->active_period) == atomic64_read(&ioc->cur_period)) return false; /* is something in flight? */ if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime)) return false; return true; } /* * Call this function on the target leaf @iocg's to build pre-order traversal * list of all the ancestors in @inner_walk. The inner nodes are linked through * ->walk_list and the caller is responsible for dissolving the list after use. */ static void iocg_build_inner_walk(struct ioc_gq *iocg, struct list_head *inner_walk) { int lvl; WARN_ON_ONCE(!list_empty(&iocg->walk_list)); /* find the first ancestor which hasn't been visited yet */ for (lvl = iocg->level - 1; lvl >= 0; lvl--) { if (!list_empty(&iocg->ancestors[lvl]->walk_list)) break; } /* walk down and visit the inner nodes to get pre-order traversal */ while (++lvl <= iocg->level - 1) { struct ioc_gq *inner = iocg->ancestors[lvl]; /* record traversal order */ list_add_tail(&inner->walk_list, inner_walk); } } /* propagate the deltas to the parent */ static void iocg_flush_stat_upward(struct ioc_gq *iocg) { if (iocg->level > 0) { struct iocg_stat *parent_stat = &iocg->ancestors[iocg->level - 1]->stat; parent_stat->usage_us += iocg->stat.usage_us - iocg->last_stat.usage_us; parent_stat->wait_us += iocg->stat.wait_us - iocg->last_stat.wait_us; parent_stat->indebt_us += iocg->stat.indebt_us - iocg->last_stat.indebt_us; parent_stat->indelay_us += iocg->stat.indelay_us - iocg->last_stat.indelay_us; } iocg->last_stat = iocg->stat; } /* collect per-cpu counters and propagate the deltas to the parent */ static void iocg_flush_stat_leaf(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; u64 abs_vusage = 0; u64 vusage_delta; int cpu; lockdep_assert_held(&iocg->ioc->lock); /* collect per-cpu counters */ for_each_possible_cpu(cpu) { abs_vusage += local64_read( per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu)); } vusage_delta = abs_vusage - iocg->last_stat_abs_vusage; iocg->last_stat_abs_vusage = abs_vusage; iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate); iocg->stat.usage_us += iocg->usage_delta_us; iocg_flush_stat_upward(iocg); } /* get stat counters ready for reading on all active iocgs */ static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now) { LIST_HEAD(inner_walk); struct ioc_gq *iocg, *tiocg; /* flush leaves and build inner node walk list */ list_for_each_entry(iocg, target_iocgs, active_list) { iocg_flush_stat_leaf(iocg, now); iocg_build_inner_walk(iocg, &inner_walk); } /* keep flushing upwards by walking the inner list backwards */ list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) { iocg_flush_stat_upward(iocg); list_del_init(&iocg->walk_list); } } /* * Determine what @iocg's hweight_inuse should be after donating unused * capacity. @hwm is the upper bound and used to signal no donation. This * function also throws away @iocg's excess budget. */ static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm, u32 usage, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; u64 vtime = atomic64_read(&iocg->vtime); s64 excess, delta, target, new_hwi; /* debt handling owns inuse for debtors */ if (iocg->abs_vdebt) return 1; /* see whether minimum margin requirement is met */ if (waitqueue_active(&iocg->waitq) || time_after64(vtime, now->vnow - ioc->margins.min)) return hwm; /* throw away excess above target */ excess = now->vnow - vtime - ioc->margins.target; if (excess > 0) { atomic64_add(excess, &iocg->vtime); atomic64_add(excess, &iocg->done_vtime); vtime += excess; ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE); } /* * Let's say the distance between iocg's and device's vtimes as a * fraction of period duration is delta. Assuming that the iocg will * consume the usage determined above, we want to determine new_hwi so * that delta equals MARGIN_TARGET at the end of the next period. * * We need to execute usage worth of IOs while spending the sum of the * new budget (1 - MARGIN_TARGET) and the leftover from the last period * (delta): * * usage = (1 - MARGIN_TARGET + delta) * new_hwi * * Therefore, the new_hwi is: * * new_hwi = usage / (1 - MARGIN_TARGET + delta) */ delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime), now->vnow - ioc->period_at_vtime); target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100; new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta); return clamp_t(s64, new_hwi, 1, hwm); } /* * For work-conservation, an iocg which isn't using all of its share should * donate the leftover to other iocgs. There are two ways to achieve this - 1. * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight. * * #1 is mathematically simpler but has the drawback of requiring synchronous * global hweight_inuse updates when idle iocg's get activated or inuse weights * change due to donation snapbacks as it has the possibility of grossly * overshooting what's allowed by the model and vrate. * * #2 is inherently safe with local operations. The donating iocg can easily * snap back to higher weights when needed without worrying about impacts on * other nodes as the impacts will be inherently correct. This also makes idle * iocg activations safe. The only effect activations have is decreasing * hweight_inuse of others, the right solution to which is for those iocgs to * snap back to higher weights. * * So, we go with #2. The challenge is calculating how each donating iocg's * inuse should be adjusted to achieve the target donation amounts. This is done * using Andy's method described in the following pdf. * * https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo * * Given the weights and target after-donation hweight_inuse values, Andy's * method determines how the proportional distribution should look like at each * sibling level to maintain the relative relationship between all non-donating * pairs. To roughly summarize, it divides the tree into donating and * non-donating parts, calculates global donation rate which is used to * determine the target hweight_inuse for each node, and then derives per-level * proportions. * * The following pdf shows that global distribution calculated this way can be * achieved by scaling inuse weights of donating leaves and propagating the * adjustments upwards proportionally. * * https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE * * Combining the above two, we can determine how each leaf iocg's inuse should * be adjusted to achieve the target donation. * * https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN * * The inline comments use symbols from the last pdf. * * b is the sum of the absolute budgets in the subtree. 1 for the root node. * f is the sum of the absolute budgets of non-donating nodes in the subtree. * t is the sum of the absolute budgets of donating nodes in the subtree. * w is the weight of the node. w = w_f + w_t * w_f is the non-donating portion of w. w_f = w * f / b * w_b is the donating portion of w. w_t = w * t / b * s is the sum of all sibling weights. s = Sum(w) for siblings * s_f and s_t are the non-donating and donating portions of s. * * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g. * w_pt is the donating portion of the parent's weight and w'_pt the same value * after adjustments. Subscript r denotes the root node's values. */ static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now) { LIST_HEAD(over_hwa); LIST_HEAD(inner_walk); struct ioc_gq *iocg, *tiocg, *root_iocg; u32 after_sum, over_sum, over_target, gamma; /* * It's pretty unlikely but possible for the total sum of * hweight_after_donation's to be higher than WEIGHT_ONE, which will * confuse the following calculations. If such condition is detected, * scale down everyone over its full share equally to keep the sum below * WEIGHT_ONE. */ after_sum = 0; over_sum = 0; list_for_each_entry(iocg, surpluses, surplus_list) { u32 hwa; current_hweight(iocg, &hwa, NULL); after_sum += iocg->hweight_after_donation; if (iocg->hweight_after_donation > hwa) { over_sum += iocg->hweight_after_donation; list_add(&iocg->walk_list, &over_hwa); } } if (after_sum >= WEIGHT_ONE) { /* * The delta should be deducted from the over_sum, calculate * target over_sum value. */ u32 over_delta = after_sum - (WEIGHT_ONE - 1); WARN_ON_ONCE(over_sum <= over_delta); over_target = over_sum - over_delta; } else { over_target = 0; } list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) { if (over_target) iocg->hweight_after_donation = div_u64((u64)iocg->hweight_after_donation * over_target, over_sum); list_del_init(&iocg->walk_list); } /* * Build pre-order inner node walk list and prepare for donation * adjustment calculations. */ list_for_each_entry(iocg, surpluses, surplus_list) { iocg_build_inner_walk(iocg, &inner_walk); } root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list); WARN_ON_ONCE(root_iocg->level > 0); list_for_each_entry(iocg, &inner_walk, walk_list) { iocg->child_adjusted_sum = 0; iocg->hweight_donating = 0; iocg->hweight_after_donation = 0; } /* * Propagate the donating budget (b_t) and after donation budget (b'_t) * up the hierarchy. */ list_for_each_entry(iocg, surpluses, surplus_list) { struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; parent->hweight_donating += iocg->hweight_donating; parent->hweight_after_donation += iocg->hweight_after_donation; } list_for_each_entry_reverse(iocg, &inner_walk, walk_list) { if (iocg->level > 0) { struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; parent->hweight_donating += iocg->hweight_donating; parent->hweight_after_donation += iocg->hweight_after_donation; } } /* * Calculate inner hwa's (b) and make sure the donation values are * within the accepted ranges as we're doing low res calculations with * roundups. */ list_for_each_entry(iocg, &inner_walk, walk_list) { if (iocg->level) { struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; iocg->hweight_active = DIV64_U64_ROUND_UP( (u64)parent->hweight_active * iocg->active, parent->child_active_sum); } iocg->hweight_donating = min(iocg->hweight_donating, iocg->hweight_active); iocg->hweight_after_donation = min(iocg->hweight_after_donation, iocg->hweight_donating - 1); if (WARN_ON_ONCE(iocg->hweight_active <= 1 || iocg->hweight_donating <= 1 || iocg->hweight_after_donation == 0)) { pr_warn("iocg: invalid donation weights in "); pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup); pr_cont(": active=%u donating=%u after=%u\n", iocg->hweight_active, iocg->hweight_donating, iocg->hweight_after_donation); } } /* * Calculate the global donation rate (gamma) - the rate to adjust * non-donating budgets by. * * No need to use 64bit multiplication here as the first operand is * guaranteed to be smaller than WEIGHT_ONE (1<<16). * * We know that there are beneficiary nodes and the sum of the donating * hweights can't be whole; however, due to the round-ups during hweight * calculations, root_iocg->hweight_donating might still end up equal to * or greater than whole. Limit the range when calculating the divider. * * gamma = (1 - t_r') / (1 - t_r) */ gamma = DIV_ROUND_UP( (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE, WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1)); /* * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner * nodes. */ list_for_each_entry(iocg, &inner_walk, walk_list) { struct ioc_gq *parent; u32 inuse, wpt, wptp; u64 st, sf; if (iocg->level == 0) { /* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */ iocg->child_adjusted_sum = DIV64_U64_ROUND_UP( iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating), WEIGHT_ONE - iocg->hweight_after_donation); continue; } parent = iocg->ancestors[iocg->level - 1]; /* b' = gamma * b_f + b_t' */ iocg->hweight_inuse = DIV64_U64_ROUND_UP( (u64)gamma * (iocg->hweight_active - iocg->hweight_donating), WEIGHT_ONE) + iocg->hweight_after_donation; /* w' = s' * b' / b'_p */ inuse = DIV64_U64_ROUND_UP( (u64)parent->child_adjusted_sum * iocg->hweight_inuse, parent->hweight_inuse); /* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */ st = DIV64_U64_ROUND_UP( iocg->child_active_sum * iocg->hweight_donating, iocg->hweight_active); sf = iocg->child_active_sum - st; wpt = DIV64_U64_ROUND_UP( (u64)iocg->active * iocg->hweight_donating, iocg->hweight_active); wptp = DIV64_U64_ROUND_UP( (u64)inuse * iocg->hweight_after_donation, iocg->hweight_inuse); iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt); } /* * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and * we can finally determine leaf adjustments. */ list_for_each_entry(iocg, surpluses, surplus_list) { struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; u32 inuse; /* * In-debt iocgs participated in the donation calculation with * the minimum target hweight_inuse. Configuring inuse * accordingly would work fine but debt handling expects * @iocg->inuse stay at the minimum and we don't wanna * interfere. */ if (iocg->abs_vdebt) { WARN_ON_ONCE(iocg->inuse > 1); continue; } /* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */ inuse = DIV64_U64_ROUND_UP( parent->child_adjusted_sum * iocg->hweight_after_donation, parent->hweight_inuse); TRACE_IOCG_PATH(inuse_transfer, iocg, now, iocg->inuse, inuse, iocg->hweight_inuse, iocg->hweight_after_donation); __propagate_weights(iocg, iocg->active, inuse, true, now); } /* walk list should be dissolved after use */ list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list) list_del_init(&iocg->walk_list); } /* * A low weight iocg can amass a large amount of debt, for example, when * anonymous memory gets reclaimed aggressively. If the system has a lot of * memory paired with a slow IO device, the debt can span multiple seconds or * more. If there are no other subsequent IO issuers, the in-debt iocg may end * up blocked paying its debt while the IO device is idle. * * The following protects against such cases. If the device has been * sufficiently idle for a while, the debts are halved and delays are * recalculated. */ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors, struct ioc_now *now) { struct ioc_gq *iocg; u64 dur, usage_pct, nr_cycles, nr_cycles_shift; /* if no debtor, reset the cycle */ if (!nr_debtors) { ioc->dfgv_period_at = now->now; ioc->dfgv_period_rem = 0; ioc->dfgv_usage_us_sum = 0; return; } /* * Debtors can pass through a lot of writes choking the device and we * don't want to be forgiving debts while the device is struggling from * write bursts. If we're missing latency targets, consider the device * fully utilized. */ if (ioc->busy_level > 0) usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us); ioc->dfgv_usage_us_sum += usage_us_sum; if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD)) return; /* * At least DFGV_PERIOD has passed since the last period. Calculate the * average usage and reset the period counters. */ dur = now->now - ioc->dfgv_period_at; usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur); ioc->dfgv_period_at = now->now; ioc->dfgv_usage_us_sum = 0; /* if was too busy, reset everything */ if (usage_pct > DFGV_USAGE_PCT) { ioc->dfgv_period_rem = 0; return; } /* * Usage is lower than threshold. Let's forgive some debts. Debt * forgiveness runs off of the usual ioc timer but its period usually * doesn't match ioc's. Compensate the difference by performing the * reduction as many times as would fit in the duration since the last * run and carrying over the left-over duration in @ioc->dfgv_period_rem * - if ioc period is 75% of DFGV_PERIOD, one out of three consecutive * reductions is doubled. */ nr_cycles = dur + ioc->dfgv_period_rem; ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD); list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { u64 __maybe_unused old_debt, __maybe_unused old_delay; if (!iocg->abs_vdebt && !iocg->delay) continue; spin_lock(&iocg->waitq.lock); old_debt = iocg->abs_vdebt; old_delay = iocg->delay; nr_cycles_shift = min_t(u64, nr_cycles, BITS_PER_LONG - 1); if (iocg->abs_vdebt) iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles_shift ?: 1; if (iocg->delay) iocg->delay = iocg->delay >> nr_cycles_shift ?: 1; iocg_kick_waitq(iocg, true, now); TRACE_IOCG_PATH(iocg_forgive_debt, iocg, now, usage_pct, old_debt, iocg->abs_vdebt, old_delay, iocg->delay); spin_unlock(&iocg->waitq.lock); } } /* * Check the active iocgs' state to avoid oversleeping and deactive * idle iocgs. * * Since waiters determine the sleep durations based on the vrate * they saw at the time of sleep, if vrate has increased, some * waiters could be sleeping for too long. Wake up tardy waiters * which should have woken up in the last period and expire idle * iocgs. */ static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now) { int nr_debtors = 0; struct ioc_gq *iocg, *tiocg; list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && !iocg->delay && !iocg_is_idle(iocg)) continue; spin_lock(&iocg->waitq.lock); /* flush wait and indebt stat deltas */ if (iocg->wait_since) { iocg->stat.wait_us += now->now - iocg->wait_since; iocg->wait_since = now->now; } if (iocg->indebt_since) { iocg->stat.indebt_us += now->now - iocg->indebt_since; iocg->indebt_since = now->now; } if (iocg->indelay_since) { iocg->stat.indelay_us += now->now - iocg->indelay_since; iocg->indelay_since = now->now; } if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt || iocg->delay) { /* might be oversleeping vtime / hweight changes, kick */ iocg_kick_waitq(iocg, true, now); if (iocg->abs_vdebt || iocg->delay) nr_debtors++; } else if (iocg_is_idle(iocg)) { /* no waiter and idle, deactivate */ u64 vtime = atomic64_read(&iocg->vtime); s64 excess; /* * @iocg has been inactive for a full duration and will * have a high budget. Account anything above target as * error and throw away. On reactivation, it'll start * with the target budget. */ excess = now->vnow - vtime - ioc->margins.target; if (excess > 0) { u32 old_hwi; current_hweight(iocg, NULL, &old_hwi); ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE); } TRACE_IOCG_PATH(iocg_idle, iocg, now, atomic64_read(&iocg->active_period), atomic64_read(&ioc->cur_period), vtime); __propagate_weights(iocg, 0, 0, false, now); list_del_init(&iocg->active_list); } spin_unlock(&iocg->waitq.lock); } commit_weights(ioc); return nr_debtors; } static void ioc_timer_fn(struct timer_list *timer) { struct ioc *ioc = container_of(timer, struct ioc, timer); struct ioc_gq *iocg, *tiocg; struct ioc_now now; LIST_HEAD(surpluses); int nr_debtors, nr_shortages = 0, nr_lagging = 0; u64 usage_us_sum = 0; u32 ppm_rthr; u32 ppm_wthr; u32 missed_ppm[2], rq_wait_pct; u64 period_vtime; int prev_busy_level; /* how were the latencies during the period? */ ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); /* take care of active iocgs */ spin_lock_irq(&ioc->lock); ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; ioc_now(ioc, &now); period_vtime = now.vnow - ioc->period_at_vtime; if (WARN_ON_ONCE(!period_vtime)) { spin_unlock_irq(&ioc->lock); return; } nr_debtors = ioc_check_iocgs(ioc, &now); /* * Wait and indebt stat are flushed above and the donation calculation * below needs updated usage stat. Let's bring stat up-to-date. */ iocg_flush_stat(&ioc->active_iocgs, &now); /* calc usage and see whether some weights need to be moved around */ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { u64 vdone, vtime, usage_us; u32 hw_active, hw_inuse; /* * Collect unused and wind vtime closer to vnow to prevent * iocgs from accumulating a large amount of budget. */ vdone = atomic64_read(&iocg->done_vtime); vtime = atomic64_read(&iocg->vtime); current_hweight(iocg, &hw_active, &hw_inuse); /* * Latency QoS detection doesn't account for IOs which are * in-flight for longer than a period. Detect them by * comparing vdone against period start. If lagging behind * IOs from past periods, don't increase vrate. */ if ((ppm_rthr != MILLION || ppm_wthr != MILLION) && !atomic_read(&iocg_to_blkg(iocg)->use_delay) && time_after64(vtime, vdone) && time_after64(vtime, now.vnow - MAX_LAGGING_PERIODS * period_vtime) && time_before64(vdone, now.vnow - period_vtime)) nr_lagging++; /* * Determine absolute usage factoring in in-flight IOs to avoid * high-latency completions appearing as idle. */ usage_us = iocg->usage_delta_us; usage_us_sum += usage_us; /* see whether there's surplus vtime */ WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); if (hw_inuse < hw_active || (!waitqueue_active(&iocg->waitq) && time_before64(vtime, now.vnow - ioc->margins.low))) { u32 hwa, old_hwi, hwm, new_hwi, usage; u64 usage_dur; if (vdone != vtime) { u64 inflight_us = DIV64_U64_ROUND_UP( cost_to_abs_cost(vtime - vdone, hw_inuse), ioc->vtime_base_rate); usage_us = max(usage_us, inflight_us); } /* convert to hweight based usage ratio */ if (time_after64(iocg->activated_at, ioc->period_at)) usage_dur = max_t(u64, now.now - iocg->activated_at, 1); else usage_dur = max_t(u64, now.now - ioc->period_at, 1); usage = clamp_t(u32, DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, usage_dur), 1, WEIGHT_ONE); /* * Already donating or accumulated enough to start. * Determine the donation amount. */ current_hweight(iocg, &hwa, &old_hwi); hwm = current_hweight_max(iocg); new_hwi = hweight_after_donation(iocg, old_hwi, hwm, usage, &now); /* * Donation calculation assumes hweight_after_donation * to be positive, a condition that a donor w/ hwa < 2 * can't meet. Don't bother with donation if hwa is * below 2. It's not gonna make a meaningful difference * anyway. */ if (new_hwi < hwm && hwa >= 2) { iocg->hweight_donating = hwa; iocg->hweight_after_donation = new_hwi; list_add(&iocg->surplus_list, &surpluses); } else if (!iocg->abs_vdebt) { /* * @iocg doesn't have enough to donate. Reset * its inuse to active. * * Don't reset debtors as their inuse's are * owned by debt handling. This shouldn't affect * donation calculuation in any meaningful way * as @iocg doesn't have a meaningful amount of * share anyway. */ TRACE_IOCG_PATH(inuse_shortage, iocg, &now, iocg->inuse, iocg->active, iocg->hweight_inuse, new_hwi); __propagate_weights(iocg, iocg->active, iocg->active, true, &now); nr_shortages++; } } else { /* genuinely short on vtime */ nr_shortages++; } } if (!list_empty(&surpluses) && nr_shortages) transfer_surpluses(&surpluses, &now); commit_weights(ioc); /* surplus list should be dissolved after use */ list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list) list_del_init(&iocg->surplus_list); /* * If q is getting clogged or we're missing too much, we're issuing * too much IO and should lower vtime rate. If we're not missing * and experiencing shortages but not surpluses, we're too stingy * and should increase vtime rate. */ prev_busy_level = ioc->busy_level; if (rq_wait_pct > RQ_WAIT_BUSY_PCT || missed_ppm[READ] > ppm_rthr || missed_ppm[WRITE] > ppm_wthr) { /* clearly missing QoS targets, slow down vrate */ ioc->busy_level = max(ioc->busy_level, 0); ioc->busy_level++; } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 && missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 && missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) { /* QoS targets are being met with >25% margin */ if (nr_shortages) { /* * We're throttling while the device has spare * capacity. If vrate was being slowed down, stop. */ ioc->busy_level = min(ioc->busy_level, 0); /* * If there are IOs spanning multiple periods, wait * them out before pushing the device harder. */ if (!nr_lagging) ioc->busy_level--; } else { /* * Nobody is being throttled and the users aren't * issuing enough IOs to saturate the device. We * simply don't know how close the device is to * saturation. Coast. */ ioc->busy_level = 0; } } else { /* inside the hysterisis margin, we're good */ ioc->busy_level = 0; } ioc->busy_level = clamp(ioc->busy_level, -1000, 1000); ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages, prev_busy_level, missed_ppm); ioc_refresh_params(ioc, false); ioc_forgive_debts(ioc, usage_us_sum, nr_debtors, &now); /* * This period is done. Move onto the next one. If nothing's * going on with the device, stop the timer. */ atomic64_inc(&ioc->cur_period); if (ioc->running != IOC_STOP) { if (!list_empty(&ioc->active_iocgs)) { ioc_start_period(ioc, &now); } else { ioc->busy_level = 0; ioc->vtime_err = 0; ioc->running = IOC_IDLE; } ioc_refresh_vrate(ioc, &now); } spin_unlock_irq(&ioc->lock); } static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, u64 abs_cost, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct ioc_margins *margins = &ioc->margins; u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi; u32 hwi, adj_step; s64 margin; u64 cost, new_inuse; unsigned long flags; current_hweight(iocg, NULL, &hwi); old_hwi = hwi; cost = abs_cost_to_cost(abs_cost, hwi); margin = now->vnow - vtime - cost; /* debt handling owns inuse for debtors */ if (iocg->abs_vdebt) return cost; /* * We only increase inuse during period and do so if the margin has * deteriorated since the previous adjustment. */ if (margin >= iocg->saved_margin || margin >= margins->low || iocg->inuse == iocg->active) return cost; spin_lock_irqsave(&ioc->lock, flags); /* we own inuse only when @iocg is in the normal active state */ if (iocg->abs_vdebt || list_empty(&iocg->active_list)) { spin_unlock_irqrestore(&ioc->lock, flags); return cost; } /* * Bump up inuse till @abs_cost fits in the existing budget. * adj_step must be determined after acquiring ioc->lock - we might * have raced and lost to another thread for activation and could * be reading 0 iocg->active before ioc->lock which will lead to * infinite loop. */ new_inuse = iocg->inuse; adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100); do { new_inuse = new_inuse + adj_step; propagate_weights(iocg, iocg->active, new_inuse, true, now); current_hweight(iocg, NULL, &hwi); cost = abs_cost_to_cost(abs_cost, hwi); } while (time_after64(vtime + cost, now->vnow) && iocg->inuse != iocg->active); spin_unlock_irqrestore(&ioc->lock, flags); TRACE_IOCG_PATH(inuse_adjust, iocg, now, old_inuse, iocg->inuse, old_hwi, hwi); return cost; } static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg, bool is_merge, u64 *costp) { struct ioc *ioc = iocg->ioc; u64 coef_seqio, coef_randio, coef_page; u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1); u64 seek_pages = 0; u64 cost = 0; /* Can't calculate cost for empty bio */ if (!bio->bi_iter.bi_size) goto out; switch (bio_op(bio)) { case REQ_OP_READ: coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO]; coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO]; coef_page = ioc->params.lcoefs[LCOEF_RPAGE]; break; case REQ_OP_WRITE: coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO]; coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO]; coef_page = ioc->params.lcoefs[LCOEF_WPAGE]; break; default: goto out; } if (iocg->cursor) { seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor); seek_pages >>= IOC_SECT_TO_PAGE_SHIFT; } if (!is_merge) { if (seek_pages > LCOEF_RANDIO_PAGES) { cost += coef_randio; } else { cost += coef_seqio; } } cost += pages * coef_page; out: *costp = cost; } static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge) { u64 cost; calc_vtime_cost_builtin(bio, iocg, is_merge, &cost); return cost; } static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc, u64 *costp) { unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT; switch (req_op(rq)) { case REQ_OP_READ: *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE]; break; case REQ_OP_WRITE: *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE]; break; default: *costp = 0; } } static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc) { u64 cost; calc_size_vtime_cost_builtin(rq, ioc, &cost); return cost; } static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) { struct blkcg_gq *blkg = bio->bi_blkg; struct ioc *ioc = rqos_to_ioc(rqos); struct ioc_gq *iocg = blkg_to_iocg(blkg); struct ioc_now now; struct iocg_wait wait; u64 abs_cost, cost, vtime; bool use_debt, ioc_locked; unsigned long flags; /* bypass IOs if disabled, still initializing, or for root cgroup */ if (!ioc->enabled || !iocg || !iocg->level) return; /* calculate the absolute vtime cost */ abs_cost = calc_vtime_cost(bio, iocg, false); if (!abs_cost) return; if (!iocg_activate(iocg, &now)) return; iocg->cursor = bio_end_sector(bio); vtime = atomic64_read(&iocg->vtime); cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now); /* * If no one's waiting and within budget, issue right away. The * tests are racy but the races aren't systemic - we only miss once * in a while which is fine. */ if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && time_before_eq64(vtime + cost, now.vnow)) { iocg_commit_bio(iocg, bio, abs_cost, cost); return; } /* * We're over budget. This can be handled in two ways. IOs which may * cause priority inversions are punted to @ioc->aux_iocg and charged as * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine * whether debt handling is needed and acquire locks accordingly. */ use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current); ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt); retry_lock: iocg_lock(iocg, ioc_locked, &flags); /* * @iocg must stay activated for debt and waitq handling. Deactivation * is synchronized against both ioc->lock and waitq.lock and we won't * get deactivated as long as we're waiting or has debt, so we're good * if we're activated here. In the unlikely cases that we aren't, just * issue the IO. */ if (unlikely(list_empty(&iocg->active_list))) { iocg_unlock(iocg, ioc_locked, &flags); iocg_commit_bio(iocg, bio, abs_cost, cost); return; } /* * We're over budget. If @bio has to be issued regardless, remember * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay * off the debt before waking more IOs. * * This way, the debt is continuously paid off each period with the * actual budget available to the cgroup. If we just wound vtime, we * would incorrectly use the current hw_inuse for the entire amount * which, for example, can lead to the cgroup staying blocked for a * long time even with substantially raised hw_inuse. * * An iocg with vdebt should stay online so that the timer can keep * deducting its vdebt and [de]activate use_delay mechanism * accordingly. We don't want to race against the timer trying to * clear them and leave @iocg inactive w/ dangling use_delay heavily * penalizing the cgroup and its descendants. */ if (use_debt) { iocg_incur_debt(iocg, abs_cost, &now); if (iocg_kick_delay(iocg, &now)) blkcg_schedule_throttle(rqos->disk, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); iocg_unlock(iocg, ioc_locked, &flags); return; } /* guarantee that iocgs w/ waiters have maximum inuse */ if (!iocg->abs_vdebt && iocg->inuse != iocg->active) { if (!ioc_locked) { iocg_unlock(iocg, false, &flags); ioc_locked = true; goto retry_lock; } propagate_weights(iocg, iocg->active, iocg->active, true, &now); } /* * Append self to the waitq and schedule the wakeup timer if we're * the first waiter. The timer duration is calculated based on the * current vrate. vtime and hweight changes can make it too short * or too long. Each wait entry records the absolute cost it's * waiting for to allow re-evaluation using a custom wait entry. * * If too short, the timer simply reschedules itself. If too long, * the period timer will notice and trigger wakeups. * * All waiters are on iocg->waitq and the wait states are * synchronized using waitq.lock. */ init_wait_func(&wait.wait, iocg_wake_fn); wait.bio = bio; wait.abs_cost = abs_cost; wait.committed = false; /* will be set true by waker */ __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait); iocg_kick_waitq(iocg, ioc_locked, &now); iocg_unlock(iocg, ioc_locked, &flags); while (true) { set_current_state(TASK_UNINTERRUPTIBLE); if (wait.committed) break; io_schedule(); } /* waker already committed us, proceed */ finish_wait(&iocg->waitq, &wait.wait); } static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio) { struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg); struct ioc *ioc = rqos_to_ioc(rqos); sector_t bio_end = bio_end_sector(bio); struct ioc_now now; u64 vtime, abs_cost, cost; unsigned long flags; /* bypass if disabled, still initializing, or for root cgroup */ if (!ioc->enabled || !iocg || !iocg->level) return; abs_cost = calc_vtime_cost(bio, iocg, true); if (!abs_cost) return; ioc_now(ioc, &now); vtime = atomic64_read(&iocg->vtime); cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now); /* update cursor if backmerging into the request at the cursor */ if (blk_rq_pos(rq) < bio_end && blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor) iocg->cursor = bio_end; /* * Charge if there's enough vtime budget and the existing request has * cost assigned. */ if (rq->bio && rq->bio->bi_iocost_cost && time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) { iocg_commit_bio(iocg, bio, abs_cost, cost); return; } /* * Otherwise, account it as debt if @iocg is online, which it should * be for the vast majority of cases. See debt handling in * ioc_rqos_throttle() for details. */ spin_lock_irqsave(&ioc->lock, flags); spin_lock(&iocg->waitq.lock); if (likely(!list_empty(&iocg->active_list))) { iocg_incur_debt(iocg, abs_cost, &now); if (iocg_kick_delay(iocg, &now)) blkcg_schedule_throttle(rqos->disk, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); } else { iocg_commit_bio(iocg, bio, abs_cost, cost); } spin_unlock(&iocg->waitq.lock); spin_unlock_irqrestore(&ioc->lock, flags); } static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio) { struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg); if (iocg && bio->bi_iocost_cost) atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime); } static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) { struct ioc *ioc = rqos_to_ioc(rqos); struct ioc_pcpu_stat *ccs; u64 on_q_ns, rq_wait_ns, size_nsec; int pidx, rw; if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns) return; switch (req_op(rq)) { case REQ_OP_READ: pidx = QOS_RLAT; rw = READ; break; case REQ_OP_WRITE: pidx = QOS_WLAT; rw = WRITE; break; default: return; } on_q_ns = blk_time_get_ns() - rq->alloc_time_ns; rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns; size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC); ccs = get_cpu_ptr(ioc->pcpu_stat); if (on_q_ns <= size_nsec || on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC) local_inc(&ccs->missed[rw].nr_met); else local_inc(&ccs->missed[rw].nr_missed); local64_add(rq_wait_ns, &ccs->rq_wait_ns); put_cpu_ptr(ccs); } static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos) { struct ioc *ioc = rqos_to_ioc(rqos); spin_lock_irq(&ioc->lock); ioc_refresh_params(ioc, false); spin_unlock_irq(&ioc->lock); } static void ioc_rqos_exit(struct rq_qos *rqos) { struct ioc *ioc = rqos_to_ioc(rqos); blkcg_deactivate_policy(rqos->disk, &blkcg_policy_iocost); spin_lock_irq(&ioc->lock); ioc->running = IOC_STOP; spin_unlock_irq(&ioc->lock); timer_shutdown_sync(&ioc->timer); free_percpu(ioc->pcpu_stat); kfree(ioc); } static const struct rq_qos_ops ioc_rqos_ops = { .throttle = ioc_rqos_throttle, .merge = ioc_rqos_merge, .done_bio = ioc_rqos_done_bio, .done = ioc_rqos_done, .queue_depth_changed = ioc_rqos_queue_depth_changed, .exit = ioc_rqos_exit, }; static int blk_iocost_init(struct gendisk *disk) { struct ioc *ioc; int i, cpu, ret; ioc = kzalloc(sizeof(*ioc), GFP_KERNEL); if (!ioc) return -ENOMEM; ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat); if (!ioc->pcpu_stat) { kfree(ioc); return -ENOMEM; } for_each_possible_cpu(cpu) { struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu); for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) { local_set(&ccs->missed[i].nr_met, 0); local_set(&ccs->missed[i].nr_missed, 0); } local64_set(&ccs->rq_wait_ns, 0); } spin_lock_init(&ioc->lock); timer_setup(&ioc->timer, ioc_timer_fn, 0); INIT_LIST_HEAD(&ioc->active_iocgs); ioc->running = IOC_IDLE; ioc->vtime_base_rate = VTIME_PER_USEC; atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock); ioc->period_at = ktime_to_us(blk_time_get()); atomic64_set(&ioc->cur_period, 0); atomic_set(&ioc->hweight_gen, 0); spin_lock_irq(&ioc->lock); ioc->autop_idx = AUTOP_INVALID; ioc_refresh_params_disk(ioc, true, disk); spin_unlock_irq(&ioc->lock); /* * rqos must be added before activation to allow ioc_pd_init() to * lookup the ioc from q. This means that the rqos methods may get * called before policy activation completion, can't assume that the * target bio has an iocg associated and need to test for NULL iocg. */ ret = rq_qos_add(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops); if (ret) goto err_free_ioc; ret = blkcg_activate_policy(disk, &blkcg_policy_iocost); if (ret) goto err_del_qos; return 0; err_del_qos: rq_qos_del(&ioc->rqos); err_free_ioc: free_percpu(ioc->pcpu_stat); kfree(ioc); return ret; } static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp) { struct ioc_cgrp *iocc; iocc = kzalloc(sizeof(struct ioc_cgrp), gfp); if (!iocc) return NULL; iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE; return &iocc->cpd; } static void ioc_cpd_free(struct blkcg_policy_data *cpd) { kfree(container_of(cpd, struct ioc_cgrp, cpd)); } static struct blkg_policy_data *ioc_pd_alloc(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp) { int levels = blkcg->css.cgroup->level + 1; struct ioc_gq *iocg; iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, disk->node_id); if (!iocg) return NULL; iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp); if (!iocg->pcpu_stat) { kfree(iocg); return NULL; } return &iocg->pd; } static void ioc_pd_init(struct blkg_policy_data *pd) { struct ioc_gq *iocg = pd_to_iocg(pd); struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd); struct ioc *ioc = q_to_ioc(blkg->q); struct ioc_now now; struct blkcg_gq *tblkg; unsigned long flags; ioc_now(ioc, &now); iocg->ioc = ioc; atomic64_set(&iocg->vtime, now.vnow); atomic64_set(&iocg->done_vtime, now.vnow); atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); INIT_LIST_HEAD(&iocg->active_list); INIT_LIST_HEAD(&iocg->walk_list); INIT_LIST_HEAD(&iocg->surplus_list); iocg->hweight_active = WEIGHT_ONE; iocg->hweight_inuse = WEIGHT_ONE; init_waitqueue_head(&iocg->waitq); hrtimer_setup(&iocg->waitq_timer, iocg_waitq_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); iocg->level = blkg->blkcg->css.cgroup->level; for (tblkg = blkg; tblkg; tblkg = tblkg->parent) { struct ioc_gq *tiocg = blkg_to_iocg(tblkg); iocg->ancestors[tiocg->level] = tiocg; } spin_lock_irqsave(&ioc->lock, flags); weight_updated(iocg, &now); spin_unlock_irqrestore(&ioc->lock, flags); } static void ioc_pd_free(struct blkg_policy_data *pd) { struct ioc_gq *iocg = pd_to_iocg(pd); struct ioc *ioc = iocg->ioc; unsigned long flags; if (ioc) { spin_lock_irqsave(&ioc->lock, flags); if (!list_empty(&iocg->active_list)) { struct ioc_now now; ioc_now(ioc, &now); propagate_weights(iocg, 0, 0, false, &now); list_del_init(&iocg->active_list); } WARN_ON_ONCE(!list_empty(&iocg->walk_list)); WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); spin_unlock_irqrestore(&ioc->lock, flags); hrtimer_cancel(&iocg->waitq_timer); } free_percpu(iocg->pcpu_stat); kfree(iocg); } static void ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) { struct ioc_gq *iocg = pd_to_iocg(pd); struct ioc *ioc = iocg->ioc; if (!ioc->enabled) return; if (iocg->level == 0) { unsigned vp10k = DIV64_U64_ROUND_CLOSEST( ioc->vtime_base_rate * 10000, VTIME_PER_USEC); seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100); } seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us); if (blkcg_debug_stats) seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu", iocg->last_stat.wait_us, iocg->last_stat.indebt_us, iocg->last_stat.indelay_us); } static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, int off) { const char *dname = blkg_dev_name(pd->blkg); struct ioc_gq *iocg = pd_to_iocg(pd); if (dname && iocg->cfg_weight) seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE); return 0; } static int ioc_weight_show(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg); seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE); blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill, &blkcg_policy_iocost, seq_cft(sf)->private, false); return 0; } static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg); struct blkg_conf_ctx ctx; struct ioc_now now; struct ioc_gq *iocg; u32 v; int ret; if (!strchr(buf, ':')) { struct blkcg_gq *blkg; if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v)) return -EINVAL; if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX) return -EINVAL; spin_lock_irq(&blkcg->lock); iocc->dfl_weight = v * WEIGHT_ONE; hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { struct ioc_gq *iocg = blkg_to_iocg(blkg); if (iocg) { spin_lock(&iocg->ioc->lock); ioc_now(iocg->ioc, &now); weight_updated(iocg, &now); spin_unlock(&iocg->ioc->lock); } } spin_unlock_irq(&blkcg->lock); return nbytes; } blkg_conf_init(&ctx, buf); ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, &ctx); if (ret) goto err; iocg = blkg_to_iocg(ctx.blkg); if (!strncmp(ctx.body, "default", 7)) { v = 0; } else { if (!sscanf(ctx.body, "%u", &v)) goto einval; if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX) goto einval; } spin_lock(&iocg->ioc->lock); iocg->cfg_weight = v * WEIGHT_ONE; ioc_now(iocg->ioc, &now); weight_updated(iocg, &now); spin_unlock(&iocg->ioc->lock); blkg_conf_exit(&ctx); return nbytes; einval: ret = -EINVAL; err: blkg_conf_exit(&ctx); return ret; } static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd, int off) { const char *dname = blkg_dev_name(pd->blkg); struct ioc *ioc = pd_to_iocg(pd)->ioc; if (!dname) return 0; spin_lock(&ioc->lock); seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n", dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto", ioc->params.qos[QOS_RPPM] / 10000, ioc->params.qos[QOS_RPPM] % 10000 / 100, ioc->params.qos[QOS_RLAT], ioc->params.qos[QOS_WPPM] / 10000, ioc->params.qos[QOS_WPPM] % 10000 / 100, ioc->params.qos[QOS_WLAT], ioc->params.qos[QOS_MIN] / 10000, ioc->params.qos[QOS_MIN] % 10000 / 100, ioc->params.qos[QOS_MAX] / 10000, ioc->params.qos[QOS_MAX] % 10000 / 100); spin_unlock(&ioc->lock); return 0; } static int ioc_qos_show(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill, &blkcg_policy_iocost, seq_cft(sf)->private, false); return 0; } static const match_table_t qos_ctrl_tokens = { { QOS_ENABLE, "enable=%u" }, { QOS_CTRL, "ctrl=%s" }, { NR_QOS_CTRL_PARAMS, NULL }, }; static const match_table_t qos_tokens = { { QOS_RPPM, "rpct=%s" }, { QOS_RLAT, "rlat=%u" }, { QOS_WPPM, "wpct=%s" }, { QOS_WLAT, "wlat=%u" }, { QOS_MIN, "min=%s" }, { QOS_MAX, "max=%s" }, { NR_QOS_PARAMS, NULL }, }; static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { struct blkg_conf_ctx ctx; struct gendisk *disk; struct ioc *ioc; u32 qos[NR_QOS_PARAMS]; bool enable, user; char *body, *p; unsigned long memflags; int ret; blkg_conf_init(&ctx, input); memflags = blkg_conf_open_bdev_frozen(&ctx); if (IS_ERR_VALUE(memflags)) { ret = memflags; goto err; } body = ctx.body; disk = ctx.bdev->bd_disk; if (!queue_is_mq(disk->queue)) { ret = -EOPNOTSUPP; goto err; } ioc = q_to_ioc(disk->queue); if (!ioc) { ret = blk_iocost_init(disk); if (ret) goto err; ioc = q_to_ioc(disk->queue); } blk_mq_quiesce_queue(disk->queue); spin_lock_irq(&ioc->lock); memcpy(qos, ioc->params.qos, sizeof(qos)); enable = ioc->enabled; user = ioc->user_qos_params; while ((p = strsep(&body, " \t\n"))) { substring_t args[MAX_OPT_ARGS]; char buf[32]; int tok; s64 v; if (!*p) continue; switch (match_token(p, qos_ctrl_tokens, args)) { case QOS_ENABLE: if (match_u64(&args[0], &v)) goto einval; enable = v; continue; case QOS_CTRL: match_strlcpy(buf, &args[0], sizeof(buf)); if (!strcmp(buf, "auto")) user = false; else if (!strcmp(buf, "user")) user = true; else goto einval; continue; } tok = match_token(p, qos_tokens, args); switch (tok) { case QOS_RPPM: case QOS_WPPM: if (match_strlcpy(buf, &args[0], sizeof(buf)) >= sizeof(buf)) goto einval; if (cgroup_parse_float(buf, 2, &v)) goto einval; if (v < 0 || v > 10000) goto einval; qos[tok] = v * 100; break; case QOS_RLAT: case QOS_WLAT: if (match_u64(&args[0], &v)) goto einval; qos[tok] = v; break; case QOS_MIN: case QOS_MAX: if (match_strlcpy(buf, &args[0], sizeof(buf)) >= sizeof(buf)) goto einval; if (cgroup_parse_float(buf, 2, &v)) goto einval; if (v < 0) goto einval; qos[tok] = clamp_t(s64, v * 100, VRATE_MIN_PPM, VRATE_MAX_PPM); break; default: goto einval; } user = true; } if (qos[QOS_MIN] > qos[QOS_MAX]) goto einval; if (enable && !ioc->enabled) { blk_stat_enable_accounting(disk->queue); blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = true; } else if (!enable && ioc->enabled) { blk_stat_disable_accounting(disk->queue); blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = false; } if (user) { memcpy(ioc->params.qos, qos, sizeof(qos)); ioc->user_qos_params = true; } else { ioc->user_qos_params = false; } ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); if (enable) wbt_disable_default(disk); else wbt_enable_default(disk); blk_mq_unquiesce_queue(disk->queue); blkg_conf_exit_frozen(&ctx, memflags); return nbytes; einval: spin_unlock_irq(&ioc->lock); blk_mq_unquiesce_queue(disk->queue); ret = -EINVAL; err: blkg_conf_exit_frozen(&ctx, memflags); return ret; } static u64 ioc_cost_model_prfill(struct seq_file *sf, struct blkg_policy_data *pd, int off) { const char *dname = blkg_dev_name(pd->blkg); struct ioc *ioc = pd_to_iocg(pd)->ioc; u64 *u = ioc->params.i_lcoefs; if (!dname) return 0; spin_lock(&ioc->lock); seq_printf(sf, "%s ctrl=%s model=linear " "rbps=%llu rseqiops=%llu rrandiops=%llu " "wbps=%llu wseqiops=%llu wrandiops=%llu\n", dname, ioc->user_cost_model ? "user" : "auto", u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS], u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]); spin_unlock(&ioc->lock); return 0; } static int ioc_cost_model_show(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill, &blkcg_policy_iocost, seq_cft(sf)->private, false); return 0; } static const match_table_t cost_ctrl_tokens = { { COST_CTRL, "ctrl=%s" }, { COST_MODEL, "model=%s" }, { NR_COST_CTRL_PARAMS, NULL }, }; static const match_table_t i_lcoef_tokens = { { I_LCOEF_RBPS, "rbps=%u" }, { I_LCOEF_RSEQIOPS, "rseqiops=%u" }, { I_LCOEF_RRANDIOPS, "rrandiops=%u" }, { I_LCOEF_WBPS, "wbps=%u" }, { I_LCOEF_WSEQIOPS, "wseqiops=%u" }, { I_LCOEF_WRANDIOPS, "wrandiops=%u" }, { NR_I_LCOEFS, NULL }, }; static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { struct blkg_conf_ctx ctx; struct request_queue *q; unsigned int memflags; struct ioc *ioc; u64 u[NR_I_LCOEFS]; bool user; char *body, *p; int ret; blkg_conf_init(&ctx, input); ret = blkg_conf_open_bdev(&ctx); if (ret) goto err; body = ctx.body; q = bdev_get_queue(ctx.bdev); if (!queue_is_mq(q)) { ret = -EOPNOTSUPP; goto err; } ioc = q_to_ioc(q); if (!ioc) { ret = blk_iocost_init(ctx.bdev->bd_disk); if (ret) goto err; ioc = q_to_ioc(q); } memflags = blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); spin_lock_irq(&ioc->lock); memcpy(u, ioc->params.i_lcoefs, sizeof(u)); user = ioc->user_cost_model; while ((p = strsep(&body, " \t\n"))) { substring_t args[MAX_OPT_ARGS]; char buf[32]; int tok; u64 v; if (!*p) continue; switch (match_token(p, cost_ctrl_tokens, args)) { case COST_CTRL: match_strlcpy(buf, &args[0], sizeof(buf)); if (!strcmp(buf, "auto")) user = false; else if (!strcmp(buf, "user")) user = true; else goto einval; continue; case COST_MODEL: match_strlcpy(buf, &args[0], sizeof(buf)); if (strcmp(buf, "linear")) goto einval; continue; } tok = match_token(p, i_lcoef_tokens, args); if (tok == NR_I_LCOEFS) goto einval; if (match_u64(&args[0], &v)) goto einval; u[tok] = v; user = true; } if (user) { memcpy(ioc->params.i_lcoefs, u, sizeof(u)); ioc->user_cost_model = true; } else { ioc->user_cost_model = false; } ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q, memflags); blkg_conf_exit(&ctx); return nbytes; einval: spin_unlock_irq(&ioc->lock); blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q, memflags); ret = -EINVAL; err: blkg_conf_exit(&ctx); return ret; } static struct cftype ioc_files[] = { { .name = "weight", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = ioc_weight_show, .write = ioc_weight_write, }, { .name = "cost.qos", .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = ioc_qos_show, .write = ioc_qos_write, }, { .name = "cost.model", .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = ioc_cost_model_show, .write = ioc_cost_model_write, }, {} }; static struct blkcg_policy blkcg_policy_iocost = { .dfl_cftypes = ioc_files, .cpd_alloc_fn = ioc_cpd_alloc, .cpd_free_fn = ioc_cpd_free, .pd_alloc_fn = ioc_pd_alloc, .pd_init_fn = ioc_pd_init, .pd_free_fn = ioc_pd_free, .pd_stat_fn = ioc_pd_stat, }; static int __init ioc_init(void) { return blkcg_policy_register(&blkcg_policy_iocost); } static void __exit ioc_exit(void) { blkcg_policy_unregister(&blkcg_policy_iocost); } module_init(ioc_init); module_exit(ioc_exit);
6 6 4 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 // SPDX-License-Identifier: GPL-2.0 /* * Interface for controlling IO bandwidth on a request queue * * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/blktrace_api.h> #include "blk.h" #include "blk-cgroup-rwstat.h" #include "blk-stat.h" #include "blk-throttle.h" /* Max dispatch from a group in 1 round */ #define THROTL_GRP_QUANTUM 8 /* Total max dispatch from all groups in one round */ #define THROTL_QUANTUM 32 /* Throttling is performed over a slice and after that slice is renewed */ #define DFL_THROTL_SLICE_HD (HZ / 10) #define DFL_THROTL_SLICE_SSD (HZ / 50) #define MAX_THROTL_SLICE (HZ) /* A workqueue to queue throttle related work */ static struct workqueue_struct *kthrotld_workqueue; #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) struct throtl_data { /* service tree for active throtl groups */ struct throtl_service_queue service_queue; struct request_queue *queue; /* Total Number of queued bios on READ and WRITE lists */ unsigned int nr_queued[2]; unsigned int throtl_slice; /* Work for dispatching throttled bios */ struct work_struct dispatch_work; bool track_bio_latency; }; static void throtl_pending_timer_fn(struct timer_list *t); static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) { return pd_to_blkg(&tg->pd); } /** * sq_to_tg - return the throl_grp the specified service queue belongs to * @sq: the throtl_service_queue of interest * * Return the throtl_grp @sq belongs to. If @sq is the top-level one * embedded in throtl_data, %NULL is returned. */ static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq) { if (sq && sq->parent_sq) return container_of(sq, struct throtl_grp, service_queue); else return NULL; } /** * sq_to_td - return throtl_data the specified service queue belongs to * @sq: the throtl_service_queue of interest * * A service_queue can be embedded in either a throtl_grp or throtl_data. * Determine the associated throtl_data accordingly and return it. */ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) { struct throtl_grp *tg = sq_to_tg(sq); if (tg) return tg->td; else return container_of(sq, struct throtl_data, service_queue); } static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw) { struct blkcg_gq *blkg = tg_to_blkg(tg); if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) return U64_MAX; return tg->bps[rw]; } static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) { struct blkcg_gq *blkg = tg_to_blkg(tg); if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) return UINT_MAX; return tg->iops[rw]; } /** * throtl_log - log debug message via blktrace * @sq: the service_queue being reported * @fmt: printf format string * @args: printf args * * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a * throtl_grp; otherwise, just "throtl". */ #define throtl_log(sq, fmt, args...) do { \ struct throtl_grp *__tg = sq_to_tg((sq)); \ struct throtl_data *__td = sq_to_td((sq)); \ \ (void)__td; \ if (likely(!blk_trace_note_message_enabled(__td->queue))) \ break; \ if ((__tg)) { \ blk_add_cgroup_trace_msg(__td->queue, \ &tg_to_blkg(__tg)->blkcg->css, "throtl " fmt, ##args);\ } else { \ blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \ } \ } while (0) static inline unsigned int throtl_bio_data_size(struct bio *bio) { /* assume it's one sector */ if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) return 512; return bio->bi_iter.bi_size; } static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) { INIT_LIST_HEAD(&qn->node); bio_list_init(&qn->bios_bps); bio_list_init(&qn->bios_iops); qn->tg = tg; } /** * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it * @bio: bio being added * @qn: qnode to add bio to * @sq: the service_queue @qn belongs to * * Add @bio to @qn and put @qn on @sq->queued if it's not already on. * @qn->tg's reference count is bumped when @qn is activated. See the * comment on top of throtl_qnode definition for details. */ static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn, struct throtl_service_queue *sq) { bool rw = bio_data_dir(bio); /* * Split bios have already been throttled by bps, so they are * directly queued into the iops path. */ if (bio_flagged(bio, BIO_TG_BPS_THROTTLED) || bio_flagged(bio, BIO_BPS_THROTTLED)) { bio_list_add(&qn->bios_iops, bio); sq->nr_queued_iops[rw]++; } else { bio_list_add(&qn->bios_bps, bio); sq->nr_queued_bps[rw]++; } if (list_empty(&qn->node)) { list_add_tail(&qn->node, &sq->queued[rw]); blkg_get(tg_to_blkg(qn->tg)); } } /** * throtl_peek_queued - peek the first bio on a qnode list * @queued: the qnode list to peek * * Always take a bio from the head of the iops queue first. If the queue is * empty, we then take it from the bps queue to maintain the overall idea of * fetching bios from the head. */ static struct bio *throtl_peek_queued(struct list_head *queued) { struct throtl_qnode *qn; struct bio *bio; if (list_empty(queued)) return NULL; qn = list_first_entry(queued, struct throtl_qnode, node); bio = bio_list_peek(&qn->bios_iops); if (!bio) bio = bio_list_peek(&qn->bios_bps); WARN_ON_ONCE(!bio); return bio; } /** * throtl_pop_queued - pop the first bio form a qnode list * @sq: the service_queue to pop a bio from * @tg_to_put: optional out argument for throtl_grp to put * @rw: read/write * * Pop the first bio from the qnode list @sq->queued. Note that we firstly * focus on the iops list because bios are ultimately dispatched from it. * After popping, the first qnode is removed from @sq->queued if empty or moved * to the end of @sq->queued so that the popping order is round-robin. * * When the first qnode is removed, its associated throtl_grp should be put * too. If @tg_to_put is NULL, this function automatically puts it; * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is * responsible for putting it. */ static struct bio *throtl_pop_queued(struct throtl_service_queue *sq, struct throtl_grp **tg_to_put, bool rw) { struct list_head *queued = &sq->queued[rw]; struct throtl_qnode *qn; struct bio *bio; if (list_empty(queued)) return NULL; qn = list_first_entry(queued, struct throtl_qnode, node); bio = bio_list_pop(&qn->bios_iops); if (bio) { sq->nr_queued_iops[rw]--; } else { bio = bio_list_pop(&qn->bios_bps); if (bio) sq->nr_queued_bps[rw]--; } WARN_ON_ONCE(!bio); if (bio_list_empty(&qn->bios_bps) && bio_list_empty(&qn->bios_iops)) { list_del_init(&qn->node); if (tg_to_put) *tg_to_put = qn->tg; else blkg_put(tg_to_blkg(qn->tg)); } else { list_move_tail(&qn->node, queued); } return bio; } /* init a service_queue, assumes the caller zeroed it */ static void throtl_service_queue_init(struct throtl_service_queue *sq) { INIT_LIST_HEAD(&sq->queued[READ]); INIT_LIST_HEAD(&sq->queued[WRITE]); sq->pending_tree = RB_ROOT_CACHED; timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0); } static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp) { struct throtl_grp *tg; int rw; tg = kzalloc_node(sizeof(*tg), gfp, disk->node_id); if (!tg) return NULL; if (blkg_rwstat_init(&tg->stat_bytes, gfp)) goto err_free_tg; if (blkg_rwstat_init(&tg->stat_ios, gfp)) goto err_exit_stat_bytes; throtl_service_queue_init(&tg->service_queue); for (rw = READ; rw <= WRITE; rw++) { throtl_qnode_init(&tg->qnode_on_self[rw], tg); throtl_qnode_init(&tg->qnode_on_parent[rw], tg); } RB_CLEAR_NODE(&tg->rb_node); tg->bps[READ] = U64_MAX; tg->bps[WRITE] = U64_MAX; tg->iops[READ] = UINT_MAX; tg->iops[WRITE] = UINT_MAX; return &tg->pd; err_exit_stat_bytes: blkg_rwstat_exit(&tg->stat_bytes); err_free_tg: kfree(tg); return NULL; } static void throtl_pd_init(struct blkg_policy_data *pd) { struct throtl_grp *tg = pd_to_tg(pd); struct blkcg_gq *blkg = tg_to_blkg(tg); struct throtl_data *td = blkg->q->td; struct throtl_service_queue *sq = &tg->service_queue; /* * If on the default hierarchy, we switch to properly hierarchical * behavior where limits on a given throtl_grp are applied to the * whole subtree rather than just the group itself. e.g. If 16M * read_bps limit is set on a parent group, summary bps of * parent group and its subtree groups can't exceed 16M for the * device. * * If not on the default hierarchy, the broken flat hierarchy * behavior is retained where all throtl_grps are treated as if * they're all separate root groups right below throtl_data. * Limits of a group don't interact with limits of other groups * regardless of the position of the group in the hierarchy. */ sq->parent_sq = &td->service_queue; if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent) sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; tg->td = td; } /* * Set has_rules[] if @tg or any of its parents have limits configured. * This doesn't require walking up to the top of the hierarchy as the * parent's has_rules[] is guaranteed to be correct. */ static void tg_update_has_rules(struct throtl_grp *tg) { struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); int rw; for (rw = READ; rw <= WRITE; rw++) { tg->has_rules_iops[rw] = (parent_tg && parent_tg->has_rules_iops[rw]) || tg_iops_limit(tg, rw) != UINT_MAX; tg->has_rules_bps[rw] = (parent_tg && parent_tg->has_rules_bps[rw]) || tg_bps_limit(tg, rw) != U64_MAX; } } static void throtl_pd_online(struct blkg_policy_data *pd) { struct throtl_grp *tg = pd_to_tg(pd); /* * We don't want new groups to escape the limits of its ancestors. * Update has_rules[] after a new group is brought online. */ tg_update_has_rules(tg); } static void throtl_pd_free(struct blkg_policy_data *pd) { struct throtl_grp *tg = pd_to_tg(pd); timer_delete_sync(&tg->service_queue.pending_timer); blkg_rwstat_exit(&tg->stat_bytes); blkg_rwstat_exit(&tg->stat_ios); kfree(tg); } static struct throtl_grp * throtl_rb_first(struct throtl_service_queue *parent_sq) { struct rb_node *n; n = rb_first_cached(&parent_sq->pending_tree); WARN_ON_ONCE(!n); if (!n) return NULL; return rb_entry_tg(n); } static void throtl_rb_erase(struct rb_node *n, struct throtl_service_queue *parent_sq) { rb_erase_cached(n, &parent_sq->pending_tree); RB_CLEAR_NODE(n); } static void update_min_dispatch_time(struct throtl_service_queue *parent_sq) { struct throtl_grp *tg; tg = throtl_rb_first(parent_sq); if (!tg) return; parent_sq->first_pending_disptime = tg->disptime; } static void tg_service_queue_add(struct throtl_grp *tg) { struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq; struct rb_node **node = &parent_sq->pending_tree.rb_root.rb_node; struct rb_node *parent = NULL; struct throtl_grp *__tg; unsigned long key = tg->disptime; bool leftmost = true; while (*node != NULL) { parent = *node; __tg = rb_entry_tg(parent); if (time_before(key, __tg->disptime)) node = &parent->rb_left; else { node = &parent->rb_right; leftmost = false; } } rb_link_node(&tg->rb_node, parent, node); rb_insert_color_cached(&tg->rb_node, &parent_sq->pending_tree, leftmost); } static void throtl_enqueue_tg(struct throtl_grp *tg) { if (!(tg->flags & THROTL_TG_PENDING)) { tg_service_queue_add(tg); tg->flags |= THROTL_TG_PENDING; tg->service_queue.parent_sq->nr_pending++; } } static void throtl_dequeue_tg(struct throtl_grp *tg) { if (tg->flags & THROTL_TG_PENDING) { struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq; throtl_rb_erase(&tg->rb_node, parent_sq); --parent_sq->nr_pending; tg->flags &= ~THROTL_TG_PENDING; } } /* Call with queue lock held */ static void throtl_schedule_pending_timer(struct throtl_service_queue *sq, unsigned long expires) { unsigned long max_expire = jiffies + 8 * sq_to_td(sq)->throtl_slice; /* * Since we are adjusting the throttle limit dynamically, the sleep * time calculated according to previous limit might be invalid. It's * possible the cgroup sleep time is very long and no other cgroups * have IO running so notify the limit changes. Make sure the cgroup * doesn't sleep too long to avoid the missed notification. */ if (time_after(expires, max_expire)) expires = max_expire; mod_timer(&sq->pending_timer, expires); throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu", expires - jiffies, jiffies); } /** * throtl_schedule_next_dispatch - schedule the next dispatch cycle * @sq: the service_queue to schedule dispatch for * @force: force scheduling * * Arm @sq->pending_timer so that the next dispatch cycle starts on the * dispatch time of the first pending child. Returns %true if either timer * is armed or there's no pending child left. %false if the current * dispatch window is still open and the caller should continue * dispatching. * * If @force is %true, the dispatch timer is always scheduled and this * function is guaranteed to return %true. This is to be used when the * caller can't dispatch itself and needs to invoke pending_timer * unconditionally. Note that forced scheduling is likely to induce short * delay before dispatch starts even if @sq->first_pending_disptime is not * in the future and thus shouldn't be used in hot paths. */ static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq, bool force) { /* any pending children left? */ if (!sq->nr_pending) return true; update_min_dispatch_time(sq); /* is the next dispatch time in the future? */ if (force || time_after(sq->first_pending_disptime, jiffies)) { throtl_schedule_pending_timer(sq, sq->first_pending_disptime); return true; } /* tell the caller to continue dispatching */ return false; } static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, bool rw, unsigned long start) { tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; /* * Previous slice has expired. We must have trimmed it after last * bio dispatch. That means since start of last slice, we never used * that bandwidth. Do try to make use of that bandwidth while giving * credit. */ if (time_after(start, tg->slice_start[rw])) tg->slice_start[rw] = start; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; throtl_log(&tg->service_queue, "[%c] new slice with credit start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', tg->slice_start[rw], tg->slice_end[rw], jiffies); } static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw, bool clear) { if (clear) { tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; } tg->slice_start[rw] = jiffies; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; throtl_log(&tg->service_queue, "[%c] new slice start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', tg->slice_start[rw], tg->slice_end[rw], jiffies); } static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, unsigned long jiffy_end) { tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice); } static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, unsigned long jiffy_end) { if (!time_before(tg->slice_end[rw], jiffy_end)) return; throtl_set_slice_end(tg, rw, jiffy_end); throtl_log(&tg->service_queue, "[%c] extend slice start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', tg->slice_start[rw], tg->slice_end[rw], jiffies); } /* Determine if previously allocated or extended slice is complete or not */ static bool throtl_slice_used(struct throtl_grp *tg, bool rw) { if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) return false; return true; } static unsigned int sq_queued(struct throtl_service_queue *sq, int type) { return sq->nr_queued_bps[type] + sq->nr_queued_iops[type]; } static unsigned int calculate_io_allowed(u32 iops_limit, unsigned long jiffy_elapsed) { unsigned int io_allowed; u64 tmp; /* * jiffy_elapsed should not be a big value as minimum iops can be * 1 then at max jiffy elapsed should be equivalent of 1 second as we * will allow dispatch after 1 second and after that slice should * have been trimmed. */ tmp = (u64)iops_limit * jiffy_elapsed; do_div(tmp, HZ); if (tmp > UINT_MAX) io_allowed = UINT_MAX; else io_allowed = tmp; return io_allowed; } static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed) { /* * Can result be wider than 64 bits? * We check against 62, not 64, due to ilog2 truncation. */ if (ilog2(bps_limit) + ilog2(jiffy_elapsed) - ilog2(HZ) > 62) return U64_MAX; return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ); } static long long throtl_trim_bps(struct throtl_grp *tg, bool rw, unsigned long time_elapsed) { u64 bps_limit = tg_bps_limit(tg, rw); long long bytes_trim; if (bps_limit == U64_MAX) return 0; /* Need to consider the case of bytes_allowed overflow. */ bytes_trim = calculate_bytes_allowed(bps_limit, time_elapsed); if (bytes_trim <= 0 || tg->bytes_disp[rw] < bytes_trim) { bytes_trim = tg->bytes_disp[rw]; tg->bytes_disp[rw] = 0; } else { tg->bytes_disp[rw] -= bytes_trim; } return bytes_trim; } static int throtl_trim_iops(struct throtl_grp *tg, bool rw, unsigned long time_elapsed) { u32 iops_limit = tg_iops_limit(tg, rw); int io_trim; if (iops_limit == UINT_MAX) return 0; /* Need to consider the case of io_allowed overflow. */ io_trim = calculate_io_allowed(iops_limit, time_elapsed); if (io_trim <= 0 || tg->io_disp[rw] < io_trim) { io_trim = tg->io_disp[rw]; tg->io_disp[rw] = 0; } else { tg->io_disp[rw] -= io_trim; } return io_trim; } /* Trim the used slices and adjust slice start accordingly */ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) { unsigned long time_elapsed; long long bytes_trim; int io_trim; BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); /* * If bps are unlimited (-1), then time slice don't get * renewed. Don't try to trim the slice if slice is used. A new * slice will start when appropriate. */ if (throtl_slice_used(tg, rw)) return; /* * A bio has been dispatched. Also adjust slice_end. It might happen * that initially cgroup limit was very low resulting in high * slice_end, but later limit was bumped up and bio was dispatched * sooner, then we need to reduce slice_end. A high bogus slice_end * is bad because it does not allow new slice to start. */ throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice); time_elapsed = rounddown(jiffies - tg->slice_start[rw], tg->td->throtl_slice); /* Don't trim slice until at least 2 slices are used */ if (time_elapsed < tg->td->throtl_slice * 2) return; /* * The bio submission time may be a few jiffies more than the expected * waiting time, due to 'extra_bytes' can't be divided in * tg_within_bps_limit(), and also due to timer wakeup delay. In this * case, adjust slice_start will discard the extra wait time, causing * lower rate than expected. Therefore, other than the above rounddown, * one extra slice is preserved for deviation. */ time_elapsed -= tg->td->throtl_slice; bytes_trim = throtl_trim_bps(tg, rw, time_elapsed); io_trim = throtl_trim_iops(tg, rw, time_elapsed); if (!bytes_trim && !io_trim) return; tg->slice_start[rw] += time_elapsed; throtl_log(&tg->service_queue, "[%c] trim slice nr=%lu bytes=%lld io=%d start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', time_elapsed / tg->td->throtl_slice, bytes_trim, io_trim, tg->slice_start[rw], tg->slice_end[rw], jiffies); } static void __tg_update_carryover(struct throtl_grp *tg, bool rw, long long *bytes, int *ios) { unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw]; u64 bps_limit = tg_bps_limit(tg, rw); u32 iops_limit = tg_iops_limit(tg, rw); long long bytes_allowed; int io_allowed; /* * If the queue is empty, carryover handling is not needed. In such cases, * tg->[bytes/io]_disp should be reset to 0 to avoid impacting the dispatch * of subsequent bios. The same handling applies when the previous BPS/IOPS * limit was set to max. */ if (sq_queued(&tg->service_queue, rw) == 0) { tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; return; } /* * If config is updated while bios are still throttled, calculate and * accumulate how many bytes/ios are waited across changes. And use the * calculated carryover (@bytes/@ios) to update [bytes/io]_disp, which * will be used to calculate new wait time under new configuration. * And we need to consider the case of bytes/io_allowed overflow. */ if (bps_limit != U64_MAX) { bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed); if (bytes_allowed > 0) *bytes = bytes_allowed - tg->bytes_disp[rw]; } if (iops_limit != UINT_MAX) { io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed); if (io_allowed > 0) *ios = io_allowed - tg->io_disp[rw]; } tg->bytes_disp[rw] = -*bytes; tg->io_disp[rw] = -*ios; } static void tg_update_carryover(struct throtl_grp *tg) { long long bytes[2] = {0}; int ios[2] = {0}; __tg_update_carryover(tg, READ, &bytes[READ], &ios[READ]); __tg_update_carryover(tg, WRITE, &bytes[WRITE], &ios[WRITE]); /* see comments in struct throtl_grp for meaning of carryover. */ throtl_log(&tg->service_queue, "%s: %lld %lld %d %d\n", __func__, bytes[READ], bytes[WRITE], ios[READ], ios[WRITE]); } static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, u32 iops_limit) { bool rw = bio_data_dir(bio); int io_allowed; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; jiffy_elapsed = jiffies - tg->slice_start[rw]; /* Round up to the next throttle slice, wait time must be nonzero */ jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd); if (io_allowed > 0 && tg->io_disp[rw] + 1 <= io_allowed) return 0; /* Calc approx time to dispatch */ jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed; /* make sure at least one io can be dispatched after waiting */ jiffy_wait = max(jiffy_wait, HZ / iops_limit + 1); return jiffy_wait; } static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, u64 bps_limit) { bool rw = bio_data_dir(bio); long long bytes_allowed; u64 extra_bytes; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; unsigned int bio_size = throtl_bio_data_size(bio); jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; /* Slice has just started. Consider one slice interval */ if (!jiffy_elapsed) jiffy_elapsed_rnd = tg->td->throtl_slice; jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd); /* Need to consider the case of bytes_allowed overflow. */ if ((bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed) || bytes_allowed < 0) return 0; /* Calc approx time to dispatch */ extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed; jiffy_wait = div64_u64(extra_bytes * HZ, bps_limit); if (!jiffy_wait) jiffy_wait = 1; /* * This wait time is without taking into consideration the rounding * up we did. Add that time also. */ jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); return jiffy_wait; } static void throtl_charge_bps_bio(struct throtl_grp *tg, struct bio *bio) { unsigned int bio_size = throtl_bio_data_size(bio); /* Charge the bio to the group */ if (!bio_flagged(bio, BIO_BPS_THROTTLED) && !bio_flagged(bio, BIO_TG_BPS_THROTTLED)) { bio_set_flag(bio, BIO_TG_BPS_THROTTLED); tg->bytes_disp[bio_data_dir(bio)] += bio_size; } } static void throtl_charge_iops_bio(struct throtl_grp *tg, struct bio *bio) { bio_clear_flag(bio, BIO_TG_BPS_THROTTLED); tg->io_disp[bio_data_dir(bio)]++; } /* * If previous slice expired, start a new one otherwise renew/extend existing * slice to make sure it is at least throtl_slice interval long since now. New * slice is started only for empty throttle group. If there is queued bio, that * means there should be an active slice and it should be extended instead. */ static void tg_update_slice(struct throtl_grp *tg, bool rw) { if (throtl_slice_used(tg, rw) && sq_queued(&tg->service_queue, rw) == 0) throtl_start_new_slice(tg, rw, true); else throtl_extend_slice(tg, rw, jiffies + tg->td->throtl_slice); } static unsigned long tg_dispatch_bps_time(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); u64 bps_limit = tg_bps_limit(tg, rw); unsigned long bps_wait; /* no need to throttle if this bio's bytes have been accounted */ if (bps_limit == U64_MAX || tg->flags & THROTL_TG_CANCELING || bio_flagged(bio, BIO_BPS_THROTTLED) || bio_flagged(bio, BIO_TG_BPS_THROTTLED)) return 0; tg_update_slice(tg, rw); bps_wait = tg_within_bps_limit(tg, bio, bps_limit); throtl_extend_slice(tg, rw, jiffies + bps_wait); return bps_wait; } static unsigned long tg_dispatch_iops_time(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); u32 iops_limit = tg_iops_limit(tg, rw); unsigned long iops_wait; if (iops_limit == UINT_MAX || tg->flags & THROTL_TG_CANCELING) return 0; tg_update_slice(tg, rw); iops_wait = tg_within_iops_limit(tg, bio, iops_limit); throtl_extend_slice(tg, rw, jiffies + iops_wait); return iops_wait; } /* * Returns approx number of jiffies to wait before this bio is with-in IO rate * and can be moved to other queue or dispatched. */ static unsigned long tg_dispatch_time(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); unsigned long wait; /* * Currently whole state machine of group depends on first bio * queued in the group bio list. So one should not be calling * this function with a different bio if there are other bios * queued. */ BUG_ON(sq_queued(&tg->service_queue, rw) && bio != throtl_peek_queued(&tg->service_queue.queued[rw])); wait = tg_dispatch_bps_time(tg, bio); if (wait != 0) return wait; /* * Charge bps here because @bio will be directly placed into the * iops queue afterward. */ throtl_charge_bps_bio(tg, bio); return tg_dispatch_iops_time(tg, bio); } /** * throtl_add_bio_tg - add a bio to the specified throtl_grp * @bio: bio to add * @qn: qnode to use * @tg: the target throtl_grp * * Add @bio to @tg's service_queue using @qn. If @qn is not specified, * tg->qnode_on_self[] is used. */ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn, struct throtl_grp *tg) { struct throtl_service_queue *sq = &tg->service_queue; bool rw = bio_data_dir(bio); if (!qn) qn = &tg->qnode_on_self[rw]; /* * If @tg doesn't currently have any bios queued in the same * direction, queueing @bio can change when @tg should be * dispatched. Mark that @tg was empty. This is automatically * cleared on the next tg_update_disptime(). */ if (sq_queued(sq, rw) == 0) tg->flags |= THROTL_TG_WAS_EMPTY; throtl_qnode_add_bio(bio, qn, sq); /* * Since we have split the queues, when the iops queue is * previously empty and a new @bio is added into the first @qn, * we also need to update the @tg->disptime. */ if (bio_flagged(bio, BIO_BPS_THROTTLED) && bio == throtl_peek_queued(&sq->queued[rw])) tg->flags |= THROTL_TG_IOPS_WAS_EMPTY; throtl_enqueue_tg(tg); } static void tg_update_disptime(struct throtl_grp *tg) { struct throtl_service_queue *sq = &tg->service_queue; unsigned long read_wait = -1, write_wait = -1, min_wait, disptime; struct bio *bio; bio = throtl_peek_queued(&sq->queued[READ]); if (bio) read_wait = tg_dispatch_time(tg, bio); bio = throtl_peek_queued(&sq->queued[WRITE]); if (bio) write_wait = tg_dispatch_time(tg, bio); min_wait = min(read_wait, write_wait); disptime = jiffies + min_wait; /* Update dispatch time */ throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); tg->disptime = disptime; tg_service_queue_add(tg); /* see throtl_add_bio_tg() */ tg->flags &= ~THROTL_TG_WAS_EMPTY; tg->flags &= ~THROTL_TG_IOPS_WAS_EMPTY; } static void start_parent_slice_with_credit(struct throtl_grp *child_tg, struct throtl_grp *parent_tg, bool rw) { if (throtl_slice_used(parent_tg, rw)) { throtl_start_new_slice_with_credit(parent_tg, rw, child_tg->slice_start[rw]); } } static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) { struct throtl_service_queue *sq = &tg->service_queue; struct throtl_service_queue *parent_sq = sq->parent_sq; struct throtl_grp *parent_tg = sq_to_tg(parent_sq); struct throtl_grp *tg_to_put = NULL; struct bio *bio; /* * @bio is being transferred from @tg to @parent_sq. Popping a bio * from @tg may put its reference and @parent_sq might end up * getting released prematurely. Remember the tg to put and put it * after @bio is transferred to @parent_sq. */ bio = throtl_pop_queued(sq, &tg_to_put, rw); throtl_charge_iops_bio(tg, bio); /* * If our parent is another tg, we just need to transfer @bio to * the parent using throtl_add_bio_tg(). If our parent is * @td->service_queue, @bio is ready to be issued. Put it on its * bio_lists[] and decrease total number queued. The caller is * responsible for issuing these bios. */ if (parent_tg) { throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg); start_parent_slice_with_credit(tg, parent_tg, rw); } else { bio_set_flag(bio, BIO_BPS_THROTTLED); throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw], parent_sq); BUG_ON(tg->td->nr_queued[rw] <= 0); tg->td->nr_queued[rw]--; } throtl_trim_slice(tg, rw); if (tg_to_put) blkg_put(tg_to_blkg(tg_to_put)); } static int throtl_dispatch_tg(struct throtl_grp *tg) { struct throtl_service_queue *sq = &tg->service_queue; unsigned int nr_reads = 0, nr_writes = 0; unsigned int max_nr_reads = THROTL_GRP_QUANTUM * 3 / 4; unsigned int max_nr_writes = THROTL_GRP_QUANTUM - max_nr_reads; struct bio *bio; /* Try to dispatch 75% READS and 25% WRITES */ while ((bio = throtl_peek_queued(&sq->queued[READ])) && tg_dispatch_time(tg, bio) == 0) { tg_dispatch_one_bio(tg, READ); nr_reads++; if (nr_reads >= max_nr_reads) break; } while ((bio = throtl_peek_queued(&sq->queued[WRITE])) && tg_dispatch_time(tg, bio) == 0) { tg_dispatch_one_bio(tg, WRITE); nr_writes++; if (nr_writes >= max_nr_writes) break; } return nr_reads + nr_writes; } static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) { unsigned int nr_disp = 0; while (1) { struct throtl_grp *tg; struct throtl_service_queue *sq; if (!parent_sq->nr_pending) break; tg = throtl_rb_first(parent_sq); if (!tg) break; if (time_before(jiffies, tg->disptime)) break; nr_disp += throtl_dispatch_tg(tg); sq = &tg->service_queue; if (sq_queued(sq, READ) || sq_queued(sq, WRITE)) tg_update_disptime(tg); else throtl_dequeue_tg(tg); if (nr_disp >= THROTL_QUANTUM) break; } return nr_disp; } /** * throtl_pending_timer_fn - timer function for service_queue->pending_timer * @t: the pending_timer member of the throtl_service_queue being serviced * * This timer is armed when a child throtl_grp with active bio's become * pending and queued on the service_queue's pending_tree and expires when * the first child throtl_grp should be dispatched. This function * dispatches bio's from the children throtl_grps to the parent * service_queue. * * If the parent's parent is another throtl_grp, dispatching is propagated * by either arming its pending_timer or repeating dispatch directly. If * the top-level service_tree is reached, throtl_data->dispatch_work is * kicked so that the ready bio's are issued. */ static void throtl_pending_timer_fn(struct timer_list *t) { struct throtl_service_queue *sq = timer_container_of(sq, t, pending_timer); struct throtl_grp *tg = sq_to_tg(sq); struct throtl_data *td = sq_to_td(sq); struct throtl_service_queue *parent_sq; struct request_queue *q; bool dispatched; int ret; /* throtl_data may be gone, so figure out request queue by blkg */ if (tg) q = tg->pd.blkg->q; else q = td->queue; spin_lock_irq(&q->queue_lock); if (!q->root_blkg) goto out_unlock; again: parent_sq = sq->parent_sq; dispatched = false; while (true) { unsigned int __maybe_unused bio_cnt_r = sq_queued(sq, READ); unsigned int __maybe_unused bio_cnt_w = sq_queued(sq, WRITE); throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u", bio_cnt_r + bio_cnt_w, bio_cnt_r, bio_cnt_w); ret = throtl_select_dispatch(sq); if (ret) { throtl_log(sq, "bios disp=%u", ret); dispatched = true; } if (throtl_schedule_next_dispatch(sq, false)) break; /* this dispatch windows is still open, relax and repeat */ spin_unlock_irq(&q->queue_lock); cpu_relax(); spin_lock_irq(&q->queue_lock); } if (!dispatched) goto out_unlock; if (parent_sq) { /* @parent_sq is another throl_grp, propagate dispatch */ if (tg->flags & THROTL_TG_WAS_EMPTY || tg->flags & THROTL_TG_IOPS_WAS_EMPTY) { tg_update_disptime(tg); if (!throtl_schedule_next_dispatch(parent_sq, false)) { /* window is already open, repeat dispatching */ sq = parent_sq; tg = sq_to_tg(sq); goto again; } } } else { /* reached the top-level, queue issuing */ queue_work(kthrotld_workqueue, &td->dispatch_work); } out_unlock: spin_unlock_irq(&q->queue_lock); } /** * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work * @work: work item being executed * * This function is queued for execution when bios reach the bio_lists[] * of throtl_data->service_queue. Those bios are ready and issued by this * function. */ static void blk_throtl_dispatch_work_fn(struct work_struct *work) { struct throtl_data *td = container_of(work, struct throtl_data, dispatch_work); struct throtl_service_queue *td_sq = &td->service_queue; struct request_queue *q = td->queue; struct bio_list bio_list_on_stack; struct bio *bio; struct blk_plug plug; int rw; bio_list_init(&bio_list_on_stack); spin_lock_irq(&q->queue_lock); for (rw = READ; rw <= WRITE; rw++) while ((bio = throtl_pop_queued(td_sq, NULL, rw))) bio_list_add(&bio_list_on_stack, bio); spin_unlock_irq(&q->queue_lock); if (!bio_list_empty(&bio_list_on_stack)) { blk_start_plug(&plug); while ((bio = bio_list_pop(&bio_list_on_stack))) submit_bio_noacct_nocheck(bio, false); blk_finish_plug(&plug); } } static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct throtl_grp *tg = pd_to_tg(pd); u64 v = *(u64 *)((void *)tg + off); if (v == U64_MAX) return 0; return __blkg_prfill_u64(sf, pd, v); } static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct throtl_grp *tg = pd_to_tg(pd); unsigned int v = *(unsigned int *)((void *)tg + off); if (v == UINT_MAX) return 0; return __blkg_prfill_u64(sf, pd, v); } static int tg_print_conf_u64(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64, &blkcg_policy_throtl, seq_cft(sf)->private, false); return 0; } static int tg_print_conf_uint(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint, &blkcg_policy_throtl, seq_cft(sf)->private, false); return 0; } static void tg_conf_updated(struct throtl_grp *tg, bool global) { struct throtl_service_queue *sq = &tg->service_queue; struct cgroup_subsys_state *pos_css; struct blkcg_gq *blkg; throtl_log(&tg->service_queue, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE), tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE)); rcu_read_lock(); /* * Update has_rules[] flags for the updated tg's subtree. A tg is * considered to have rules if either the tg itself or any of its * ancestors has rules. This identifies groups without any * restrictions in the whole hierarchy and allows them to bypass * blk-throttle. */ blkg_for_each_descendant_pre(blkg, pos_css, global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) { struct throtl_grp *this_tg = blkg_to_tg(blkg); tg_update_has_rules(this_tg); /* ignore root/second level */ if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent || !blkg->parent->parent) continue; } rcu_read_unlock(); /* * We're already holding queue_lock and know @tg is valid. Let's * apply the new config directly. * * Restart the slices for both READ and WRITES. It might happen * that a group's limit are dropped suddenly and we don't want to * account recently dispatched IO with new low rate. */ throtl_start_new_slice(tg, READ, false); throtl_start_new_slice(tg, WRITE, false); if (tg->flags & THROTL_TG_PENDING) { tg_update_disptime(tg); throtl_schedule_next_dispatch(sq->parent_sq, true); } } static int blk_throtl_init(struct gendisk *disk) { struct request_queue *q = disk->queue; struct throtl_data *td; unsigned int memflags; int ret; td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); if (!td) return -ENOMEM; INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); throtl_service_queue_init(&td->service_queue); memflags = blk_mq_freeze_queue(disk->queue); blk_mq_quiesce_queue(disk->queue); q->td = td; td->queue = q; /* activate policy, blk_throtl_activated() will return true */ ret = blkcg_activate_policy(disk, &blkcg_policy_throtl); if (ret) { q->td = NULL; kfree(td); goto out; } if (blk_queue_nonrot(q)) td->throtl_slice = DFL_THROTL_SLICE_SSD; else td->throtl_slice = DFL_THROTL_SLICE_HD; td->track_bio_latency = !queue_is_mq(q); if (!td->track_bio_latency) blk_stat_enable_accounting(q); out: blk_mq_unquiesce_queue(disk->queue); blk_mq_unfreeze_queue(disk->queue, memflags); return ret; } static ssize_t tg_set_conf(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool is_u64) { struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; struct throtl_grp *tg; int ret; u64 v; blkg_conf_init(&ctx, buf); ret = blkg_conf_open_bdev(&ctx); if (ret) goto out_finish; if (!blk_throtl_activated(ctx.bdev->bd_queue)) { ret = blk_throtl_init(ctx.bdev->bd_disk); if (ret) goto out_finish; } ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); if (ret) goto out_finish; ret = -EINVAL; if (sscanf(ctx.body, "%llu", &v) != 1) goto out_finish; if (!v) v = U64_MAX; tg = blkg_to_tg(ctx.blkg); tg_update_carryover(tg); if (is_u64) *(u64 *)((void *)tg + of_cft(of)->private) = v; else *(unsigned int *)((void *)tg + of_cft(of)->private) = v; tg_conf_updated(tg, false); ret = 0; out_finish: blkg_conf_exit(&ctx); return ret ?: nbytes; } static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return tg_set_conf(of, buf, nbytes, off, true); } static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return tg_set_conf(of, buf, nbytes, off, false); } static int tg_print_rwstat(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, &blkcg_policy_throtl, seq_cft(sf)->private, true); return 0; } static u64 tg_prfill_rwstat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct blkg_rwstat_sample sum; blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_throtl, off, &sum); return __blkg_prfill_rwstat(sf, pd, &sum); } static int tg_print_rwstat_recursive(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_rwstat_recursive, &blkcg_policy_throtl, seq_cft(sf)->private, true); return 0; } static struct cftype throtl_legacy_files[] = { { .name = "throttle.read_bps_device", .private = offsetof(struct throtl_grp, bps[READ]), .seq_show = tg_print_conf_u64, .write = tg_set_conf_u64, }, { .name = "throttle.write_bps_device", .private = offsetof(struct throtl_grp, bps[WRITE]), .seq_show = tg_print_conf_u64, .write = tg_set_conf_u64, }, { .name = "throttle.read_iops_device", .private = offsetof(struct throtl_grp, iops[READ]), .seq_show = tg_print_conf_uint, .write = tg_set_conf_uint, }, { .name = "throttle.write_iops_device", .private = offsetof(struct throtl_grp, iops[WRITE]), .seq_show = tg_print_conf_uint, .write = tg_set_conf_uint, }, { .name = "throttle.io_service_bytes", .private = offsetof(struct throtl_grp, stat_bytes), .seq_show = tg_print_rwstat, }, { .name = "throttle.io_service_bytes_recursive", .private = offsetof(struct throtl_grp, stat_bytes), .seq_show = tg_print_rwstat_recursive, }, { .name = "throttle.io_serviced", .private = offsetof(struct throtl_grp, stat_ios), .seq_show = tg_print_rwstat, }, { .name = "throttle.io_serviced_recursive", .private = offsetof(struct throtl_grp, stat_ios), .seq_show = tg_print_rwstat_recursive, }, { } /* terminate */ }; static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct throtl_grp *tg = pd_to_tg(pd); const char *dname = blkg_dev_name(pd->blkg); u64 bps_dft; unsigned int iops_dft; if (!dname) return 0; bps_dft = U64_MAX; iops_dft = UINT_MAX; if (tg->bps[READ] == bps_dft && tg->bps[WRITE] == bps_dft && tg->iops[READ] == iops_dft && tg->iops[WRITE] == iops_dft) return 0; seq_printf(sf, "%s", dname); if (tg->bps[READ] == U64_MAX) seq_printf(sf, " rbps=max"); else seq_printf(sf, " rbps=%llu", tg->bps[READ]); if (tg->bps[WRITE] == U64_MAX) seq_printf(sf, " wbps=max"); else seq_printf(sf, " wbps=%llu", tg->bps[WRITE]); if (tg->iops[READ] == UINT_MAX) seq_printf(sf, " riops=max"); else seq_printf(sf, " riops=%u", tg->iops[READ]); if (tg->iops[WRITE] == UINT_MAX) seq_printf(sf, " wiops=max"); else seq_printf(sf, " wiops=%u", tg->iops[WRITE]); seq_printf(sf, "\n"); return 0; } static int tg_print_limit(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit, &blkcg_policy_throtl, seq_cft(sf)->private, false); return 0; } static ssize_t tg_set_limit(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; struct throtl_grp *tg; u64 v[4]; int ret; blkg_conf_init(&ctx, buf); ret = blkg_conf_open_bdev(&ctx); if (ret) goto out_finish; if (!blk_throtl_activated(ctx.bdev->bd_queue)) { ret = blk_throtl_init(ctx.bdev->bd_disk); if (ret) goto out_finish; } ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); if (ret) goto out_finish; tg = blkg_to_tg(ctx.blkg); tg_update_carryover(tg); v[0] = tg->bps[READ]; v[1] = tg->bps[WRITE]; v[2] = tg->iops[READ]; v[3] = tg->iops[WRITE]; while (true) { char tok[27]; /* wiops=18446744073709551616 */ char *p; u64 val = U64_MAX; int len; if (sscanf(ctx.body, "%26s%n", tok, &len) != 1) break; if (tok[0] == '\0') break; ctx.body += len; ret = -EINVAL; p = tok; strsep(&p, "="); if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max"))) goto out_finish; ret = -ERANGE; if (!val) goto out_finish; ret = -EINVAL; if (!strcmp(tok, "rbps")) v[0] = val; else if (!strcmp(tok, "wbps")) v[1] = val; else if (!strcmp(tok, "riops")) v[2] = min_t(u64, val, UINT_MAX); else if (!strcmp(tok, "wiops")) v[3] = min_t(u64, val, UINT_MAX); else goto out_finish; } tg->bps[READ] = v[0]; tg->bps[WRITE] = v[1]; tg->iops[READ] = v[2]; tg->iops[WRITE] = v[3]; tg_conf_updated(tg, false); ret = 0; out_finish: blkg_conf_exit(&ctx); return ret ?: nbytes; } static struct cftype throtl_files[] = { { .name = "max", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = tg_print_limit, .write = tg_set_limit, }, { } /* terminate */ }; static void throtl_shutdown_wq(struct request_queue *q) { struct throtl_data *td = q->td; cancel_work_sync(&td->dispatch_work); } static void tg_flush_bios(struct throtl_grp *tg) { struct throtl_service_queue *sq = &tg->service_queue; if (tg->flags & THROTL_TG_CANCELING) return; /* * Set the flag to make sure throtl_pending_timer_fn() won't * stop until all throttled bios are dispatched. */ tg->flags |= THROTL_TG_CANCELING; /* * Do not dispatch cgroup without THROTL_TG_PENDING or cgroup * will be inserted to service queue without THROTL_TG_PENDING * set in tg_update_disptime below. Then IO dispatched from * child in tg_dispatch_one_bio will trigger double insertion * and corrupt the tree. */ if (!(tg->flags & THROTL_TG_PENDING)) return; /* * Update disptime after setting the above flag to make sure * throtl_select_dispatch() won't exit without dispatching. */ tg_update_disptime(tg); throtl_schedule_pending_timer(sq, jiffies + 1); } static void throtl_pd_offline(struct blkg_policy_data *pd) { tg_flush_bios(pd_to_tg(pd)); } struct blkcg_policy blkcg_policy_throtl = { .dfl_cftypes = throtl_files, .legacy_cftypes = throtl_legacy_files, .pd_alloc_fn = throtl_pd_alloc, .pd_init_fn = throtl_pd_init, .pd_online_fn = throtl_pd_online, .pd_offline_fn = throtl_pd_offline, .pd_free_fn = throtl_pd_free, }; void blk_throtl_cancel_bios(struct gendisk *disk) { struct request_queue *q = disk->queue; struct cgroup_subsys_state *pos_css; struct blkcg_gq *blkg; if (!blk_throtl_activated(q)) return; spin_lock_irq(&q->queue_lock); /* * queue_lock is held, rcu lock is not needed here technically. * However, rcu lock is still held to emphasize that following * path need RCU protection and to prevent warning from lockdep. */ rcu_read_lock(); blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { /* * disk_release will call pd_offline_fn to cancel bios. * However, disk_release can't be called if someone get * the refcount of device and issued bios which are * inflight after del_gendisk. * Cancel bios here to ensure no bios are inflight after * del_gendisk. */ tg_flush_bios(blkg_to_tg(blkg)); } rcu_read_unlock(); spin_unlock_irq(&q->queue_lock); } static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw) { struct throtl_service_queue *sq = &tg->service_queue; /* * For a split bio, we need to specifically distinguish whether the * iops queue is empty. */ if (bio_flagged(bio, BIO_BPS_THROTTLED)) return sq->nr_queued_iops[rw] == 0 && tg_dispatch_iops_time(tg, bio) == 0; /* * Throtl is FIFO - if bios are already queued, should queue. * If the bps queue is empty and @bio is within the bps limit, charge * bps here for direct placement into the iops queue. */ if (sq_queued(&tg->service_queue, rw)) { if (sq->nr_queued_bps[rw] == 0 && tg_dispatch_bps_time(tg, bio) == 0) throtl_charge_bps_bio(tg, bio); return false; } return tg_dispatch_time(tg, bio) == 0; } bool __blk_throtl_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct blkcg_gq *blkg = bio->bi_blkg; struct throtl_qnode *qn = NULL; struct throtl_grp *tg = blkg_to_tg(blkg); struct throtl_service_queue *sq; bool rw = bio_data_dir(bio); bool throttled = false; struct throtl_data *td = tg->td; rcu_read_lock(); spin_lock_irq(&q->queue_lock); sq = &tg->service_queue; while (true) { if (tg_within_limit(tg, bio, rw)) { /* within limits, let's charge and dispatch directly */ throtl_charge_iops_bio(tg, bio); /* * We need to trim slice even when bios are not being * queued otherwise it might happen that a bio is not * queued for a long time and slice keeps on extending * and trim is not called for a long time. Now if limits * are reduced suddenly we take into account all the IO * dispatched so far at new low rate and * newly queued * IO gets a really long dispatch time. * * So keep on trimming slice even if bio is not queued. */ throtl_trim_slice(tg, rw); } else if (bio_issue_as_root_blkg(bio)) { /* * IOs which may cause priority inversions are * dispatched directly, even if they're over limit. * * Charge and dispatch directly, and our throttle * control algorithm is adaptive, and extra IO bytes * will be throttled for paying the debt */ throtl_charge_bps_bio(tg, bio); throtl_charge_iops_bio(tg, bio); } else { /* if above limits, break to queue */ break; } /* * @bio passed through this layer without being throttled. * Climb up the ladder. If we're already at the top, it * can be executed directly. */ qn = &tg->qnode_on_parent[rw]; sq = sq->parent_sq; tg = sq_to_tg(sq); if (!tg) { bio_set_flag(bio, BIO_BPS_THROTTLED); goto out_unlock; } } /* out-of-limit, queue to @tg */ throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d", rw == READ ? 'R' : 'W', tg->bytes_disp[rw], bio->bi_iter.bi_size, tg_bps_limit(tg, rw), tg->io_disp[rw], tg_iops_limit(tg, rw), sq_queued(sq, READ), sq_queued(sq, WRITE)); td->nr_queued[rw]++; throtl_add_bio_tg(bio, qn, tg); throttled = true; /* * Update @tg's dispatch time and force schedule dispatch if @tg * was empty before @bio, or the iops queue is empty and @bio will * add to. The forced scheduling isn't likely to cause undue * delay as @bio is likely to be dispatched directly if its @tg's * disptime is not in the future. */ if (tg->flags & THROTL_TG_WAS_EMPTY || tg->flags & THROTL_TG_IOPS_WAS_EMPTY) { tg_update_disptime(tg); throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true); } out_unlock: spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); return throttled; } void blk_throtl_exit(struct gendisk *disk) { struct request_queue *q = disk->queue; /* * blkg_destroy_all() already deactivate throtl policy, just check and * free throtl data. */ if (!q->td) return; timer_delete_sync(&q->td->service_queue.pending_timer); throtl_shutdown_wq(q); kfree(q->td); } static int __init throtl_init(void) { kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0); if (!kthrotld_workqueue) panic("Failed to create kthrotld\n"); return blkcg_policy_register(&blkcg_policy_throtl); } module_init(throtl_init);
25 14 10 16 9 21 4 21 4 21 4 16 9 22 3 37 36 2 38 2 37 4 36 3 37 1 37 1 37 1 7 3 2 1 1 34 34 33 33 29 1 22 8 5 25 5 24 36 2 10 24 34 30 6 6 1 12 11 2 4 4 2 27 2 24 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 // SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/security.h> #include <linux/fscrypt.h> #include <linux/fileattr.h> #include <linux/export.h> #include <linux/syscalls.h> #include <linux/namei.h> #include "internal.h" /** * fileattr_fill_xflags - initialize fileattr with xflags * @fa: fileattr pointer * @xflags: FS_XFLAG_* flags * * Set ->fsx_xflags, ->fsx_valid and ->flags (translated xflags). All * other fields are zeroed. */ void fileattr_fill_xflags(struct file_kattr *fa, u32 xflags) { memset(fa, 0, sizeof(*fa)); fa->fsx_valid = true; fa->fsx_xflags = xflags; if (fa->fsx_xflags & FS_XFLAG_IMMUTABLE) fa->flags |= FS_IMMUTABLE_FL; if (fa->fsx_xflags & FS_XFLAG_APPEND) fa->flags |= FS_APPEND_FL; if (fa->fsx_xflags & FS_XFLAG_SYNC) fa->flags |= FS_SYNC_FL; if (fa->fsx_xflags & FS_XFLAG_NOATIME) fa->flags |= FS_NOATIME_FL; if (fa->fsx_xflags & FS_XFLAG_NODUMP) fa->flags |= FS_NODUMP_FL; if (fa->fsx_xflags & FS_XFLAG_DAX) fa->flags |= FS_DAX_FL; if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT) fa->flags |= FS_PROJINHERIT_FL; } EXPORT_SYMBOL(fileattr_fill_xflags); /** * fileattr_fill_flags - initialize fileattr with flags * @fa: fileattr pointer * @flags: FS_*_FL flags * * Set ->flags, ->flags_valid and ->fsx_xflags (translated flags). * All other fields are zeroed. */ void fileattr_fill_flags(struct file_kattr *fa, u32 flags) { memset(fa, 0, sizeof(*fa)); fa->flags_valid = true; fa->flags = flags; if (fa->flags & FS_SYNC_FL) fa->fsx_xflags |= FS_XFLAG_SYNC; if (fa->flags & FS_IMMUTABLE_FL) fa->fsx_xflags |= FS_XFLAG_IMMUTABLE; if (fa->flags & FS_APPEND_FL) fa->fsx_xflags |= FS_XFLAG_APPEND; if (fa->flags & FS_NODUMP_FL) fa->fsx_xflags |= FS_XFLAG_NODUMP; if (fa->flags & FS_NOATIME_FL) fa->fsx_xflags |= FS_XFLAG_NOATIME; if (fa->flags & FS_DAX_FL) fa->fsx_xflags |= FS_XFLAG_DAX; if (fa->flags & FS_PROJINHERIT_FL) fa->fsx_xflags |= FS_XFLAG_PROJINHERIT; } EXPORT_SYMBOL(fileattr_fill_flags); /** * vfs_fileattr_get - retrieve miscellaneous file attributes * @dentry: the object to retrieve from * @fa: fileattr pointer * * Call i_op->fileattr_get() callback, if exists. * * Return: 0 on success, or a negative error on failure. */ int vfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); int error; if (!inode->i_op->fileattr_get) return -EOPNOTSUPP; error = security_inode_file_getattr(dentry, fa); if (error) return error; return inode->i_op->fileattr_get(dentry, fa); } EXPORT_SYMBOL(vfs_fileattr_get); static void fileattr_to_file_attr(const struct file_kattr *fa, struct file_attr *fattr) { __u32 mask = FS_XFLAGS_MASK; memset(fattr, 0, sizeof(struct file_attr)); fattr->fa_xflags = fa->fsx_xflags & mask; fattr->fa_extsize = fa->fsx_extsize; fattr->fa_nextents = fa->fsx_nextents; fattr->fa_projid = fa->fsx_projid; fattr->fa_cowextsize = fa->fsx_cowextsize; } /** * copy_fsxattr_to_user - copy fsxattr to userspace. * @fa: fileattr pointer * @ufa: fsxattr user pointer * * Return: 0 on success, or -EFAULT on failure. */ int copy_fsxattr_to_user(const struct file_kattr *fa, struct fsxattr __user *ufa) { struct fsxattr xfa; __u32 mask = FS_XFLAGS_MASK; memset(&xfa, 0, sizeof(xfa)); xfa.fsx_xflags = fa->fsx_xflags & mask; xfa.fsx_extsize = fa->fsx_extsize; xfa.fsx_nextents = fa->fsx_nextents; xfa.fsx_projid = fa->fsx_projid; xfa.fsx_cowextsize = fa->fsx_cowextsize; if (copy_to_user(ufa, &xfa, sizeof(xfa))) return -EFAULT; return 0; } EXPORT_SYMBOL(copy_fsxattr_to_user); static int file_attr_to_fileattr(const struct file_attr *fattr, struct file_kattr *fa) { __u64 mask = FS_XFLAGS_MASK; if (fattr->fa_xflags & ~mask) return -EINVAL; fileattr_fill_xflags(fa, fattr->fa_xflags); fa->fsx_xflags &= ~FS_XFLAG_RDONLY_MASK; fa->fsx_extsize = fattr->fa_extsize; fa->fsx_projid = fattr->fa_projid; fa->fsx_cowextsize = fattr->fa_cowextsize; return 0; } static int copy_fsxattr_from_user(struct file_kattr *fa, struct fsxattr __user *ufa) { struct fsxattr xfa; __u32 mask = FS_XFLAGS_MASK; if (copy_from_user(&xfa, ufa, sizeof(xfa))) return -EFAULT; if (xfa.fsx_xflags & ~mask) return -EOPNOTSUPP; fileattr_fill_xflags(fa, xfa.fsx_xflags); fa->fsx_xflags &= ~FS_XFLAG_RDONLY_MASK; fa->fsx_extsize = xfa.fsx_extsize; fa->fsx_nextents = xfa.fsx_nextents; fa->fsx_projid = xfa.fsx_projid; fa->fsx_cowextsize = xfa.fsx_cowextsize; return 0; } /* * Generic function to check FS_IOC_FSSETXATTR/FS_IOC_SETFLAGS values and reject * any invalid configurations. * * Note: must be called with inode lock held. */ static int fileattr_set_prepare(struct inode *inode, const struct file_kattr *old_ma, struct file_kattr *fa) { int err; /* * The IMMUTABLE and APPEND_ONLY flags can only be changed by * the relevant capability. */ if ((fa->flags ^ old_ma->flags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) && !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; err = fscrypt_prepare_setflags(inode, old_ma->flags, fa->flags); if (err) return err; /* * Project Quota ID state is only allowed to change from within the init * namespace. Enforce that restriction only if we are trying to change * the quota ID state. Everything else is allowed in user namespaces. */ if (current_user_ns() != &init_user_ns) { if (old_ma->fsx_projid != fa->fsx_projid) return -EINVAL; if ((old_ma->fsx_xflags ^ fa->fsx_xflags) & FS_XFLAG_PROJINHERIT) return -EINVAL; } else { /* * Caller is allowed to change the project ID. If it is being * changed, make sure that the new value is valid. */ if (old_ma->fsx_projid != fa->fsx_projid && !projid_valid(make_kprojid(&init_user_ns, fa->fsx_projid))) return -EINVAL; } /* Check extent size hints. */ if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode)) return -EINVAL; if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) && !S_ISDIR(inode->i_mode)) return -EINVAL; if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) && !S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) return -EINVAL; /* * It is only valid to set the DAX flag on regular files and * directories on filesystems. */ if ((fa->fsx_xflags & FS_XFLAG_DAX) && !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) return -EINVAL; /* Extent size hints of zero turn off the flags. */ if (fa->fsx_extsize == 0) fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT); if (fa->fsx_cowextsize == 0) fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; return 0; } /** * vfs_fileattr_set - change miscellaneous file attributes * @idmap: idmap of the mount * @dentry: the object to change * @fa: fileattr pointer * * After verifying permissions, call i_op->fileattr_set() callback, if * exists. * * Verifying attributes involves retrieving current attributes with * i_op->fileattr_get(), this also allows initializing attributes that have * not been set by the caller to current values. Inode lock is held * thoughout to prevent racing with another instance. * * Return: 0 on success, or a negative error on failure. */ int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct file_kattr old_ma = {}; int err; if (!inode->i_op->fileattr_set) return -EOPNOTSUPP; if (!inode_owner_or_capable(idmap, inode)) return -EPERM; inode_lock(inode); err = vfs_fileattr_get(dentry, &old_ma); if (!err) { /* initialize missing bits from old_ma */ if (fa->flags_valid) { fa->fsx_xflags |= old_ma.fsx_xflags & ~FS_XFLAG_COMMON; fa->fsx_extsize = old_ma.fsx_extsize; fa->fsx_nextents = old_ma.fsx_nextents; fa->fsx_projid = old_ma.fsx_projid; fa->fsx_cowextsize = old_ma.fsx_cowextsize; } else { fa->flags |= old_ma.flags & ~FS_COMMON_FL; } err = fileattr_set_prepare(inode, &old_ma, fa); if (err) goto out; err = security_inode_file_setattr(dentry, fa); if (err) goto out; err = inode->i_op->fileattr_set(idmap, dentry, fa); if (err) goto out; } out: inode_unlock(inode); return err; } EXPORT_SYMBOL(vfs_fileattr_set); int ioctl_getflags(struct file *file, unsigned int __user *argp) { struct file_kattr fa = { .flags_valid = true }; /* hint only */ int err; err = vfs_fileattr_get(file->f_path.dentry, &fa); if (err == -EOPNOTSUPP) err = -ENOIOCTLCMD; if (!err) err = put_user(fa.flags, argp); return err; } EXPORT_SYMBOL(ioctl_getflags); int ioctl_setflags(struct file *file, unsigned int __user *argp) { struct mnt_idmap *idmap = file_mnt_idmap(file); struct dentry *dentry = file->f_path.dentry; struct file_kattr fa; unsigned int flags; int err; err = get_user(flags, argp); if (!err) { err = mnt_want_write_file(file); if (!err) { fileattr_fill_flags(&fa, flags); err = vfs_fileattr_set(idmap, dentry, &fa); mnt_drop_write_file(file); if (err == -EOPNOTSUPP) err = -ENOIOCTLCMD; } } return err; } EXPORT_SYMBOL(ioctl_setflags); int ioctl_fsgetxattr(struct file *file, void __user *argp) { struct file_kattr fa = { .fsx_valid = true }; /* hint only */ int err; err = vfs_fileattr_get(file->f_path.dentry, &fa); if (err == -EOPNOTSUPP) err = -ENOIOCTLCMD; if (!err) err = copy_fsxattr_to_user(&fa, argp); return err; } EXPORT_SYMBOL(ioctl_fsgetxattr); int ioctl_fssetxattr(struct file *file, void __user *argp) { struct mnt_idmap *idmap = file_mnt_idmap(file); struct dentry *dentry = file->f_path.dentry; struct file_kattr fa; int err; err = copy_fsxattr_from_user(&fa, argp); if (!err) { err = mnt_want_write_file(file); if (!err) { err = vfs_fileattr_set(idmap, dentry, &fa); mnt_drop_write_file(file); if (err == -EOPNOTSUPP) err = -ENOIOCTLCMD; } } return err; } EXPORT_SYMBOL(ioctl_fssetxattr); SYSCALL_DEFINE5(file_getattr, int, dfd, const char __user *, filename, struct file_attr __user *, ufattr, size_t, usize, unsigned int, at_flags) { struct path filepath __free(path_put) = {}; struct filename *name __free(putname) = NULL; unsigned int lookup_flags = 0; struct file_attr fattr; struct file_kattr fa; int error; BUILD_BUG_ON(sizeof(struct file_attr) < FILE_ATTR_SIZE_VER0); BUILD_BUG_ON(sizeof(struct file_attr) != FILE_ATTR_SIZE_LATEST); if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; if (!(at_flags & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; if (usize > PAGE_SIZE) return -E2BIG; if (usize < FILE_ATTR_SIZE_VER0) return -EINVAL; name = getname_maybe_null(filename, at_flags); if (IS_ERR(name)) return PTR_ERR(name); if (!name && dfd >= 0) { CLASS(fd, f)(dfd); if (fd_empty(f)) return -EBADF; filepath = fd_file(f)->f_path; path_get(&filepath); } else { error = filename_lookup(dfd, name, lookup_flags, &filepath, NULL); if (error) return error; } error = vfs_fileattr_get(filepath.dentry, &fa); if (error) return error; fileattr_to_file_attr(&fa, &fattr); error = copy_struct_to_user(ufattr, usize, &fattr, sizeof(struct file_attr), NULL); return error; } SYSCALL_DEFINE5(file_setattr, int, dfd, const char __user *, filename, struct file_attr __user *, ufattr, size_t, usize, unsigned int, at_flags) { struct path filepath __free(path_put) = {}; struct filename *name __free(putname) = NULL; unsigned int lookup_flags = 0; struct file_attr fattr; struct file_kattr fa; int error; BUILD_BUG_ON(sizeof(struct file_attr) < FILE_ATTR_SIZE_VER0); BUILD_BUG_ON(sizeof(struct file_attr) != FILE_ATTR_SIZE_LATEST); if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; if (!(at_flags & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; if (usize > PAGE_SIZE) return -E2BIG; if (usize < FILE_ATTR_SIZE_VER0) return -EINVAL; error = copy_struct_from_user(&fattr, sizeof(struct file_attr), ufattr, usize); if (error) return error; error = file_attr_to_fileattr(&fattr, &fa); if (error) return error; name = getname_maybe_null(filename, at_flags); if (IS_ERR(name)) return PTR_ERR(name); if (!name && dfd >= 0) { CLASS(fd, f)(dfd); if (fd_empty(f)) return -EBADF; filepath = fd_file(f)->f_path; path_get(&filepath); } else { error = filename_lookup(dfd, name, lookup_flags, &filepath, NULL); if (error) return error; } error = mnt_want_write(filepath.mnt); if (!error) { error = vfs_fileattr_set(mnt_idmap(filepath.mnt), filepath.dentry, &fa); mnt_drop_write(filepath.mnt); } return error; }
3 3 3 3 3 3 3 3 33 33 33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 // SPDX-License-Identifier: GPL-2.0-only /* * linux/net/sunrpc/stats.c * * procfs-based user access to generic RPC statistics. The stats files * reside in /proc/net/rpc. * * The read routines assume that the buffer passed in is just big enough. * If you implement an RPC service that has its own stats routine which * appends the generic RPC stats, make sure you don't exceed the PAGE_SIZE * limit. * * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/metrics.h> #include <linux/rcupdate.h> #include <trace/events/sunrpc.h> #include "netns.h" #define RPCDBG_FACILITY RPCDBG_MISC /* * Get RPC client stats */ static int rpc_proc_show(struct seq_file *seq, void *v) { const struct rpc_stat *statp = seq->private; const struct rpc_program *prog = statp->program; unsigned int i, j; seq_printf(seq, "net %u %u %u %u\n", statp->netcnt, statp->netudpcnt, statp->nettcpcnt, statp->nettcpconn); seq_printf(seq, "rpc %u %u %u\n", statp->rpccnt, statp->rpcretrans, statp->rpcauthrefresh); for (i = 0; i < prog->nrvers; i++) { const struct rpc_version *vers = prog->version[i]; if (!vers) continue; seq_printf(seq, "proc%u %u", vers->number, vers->nrprocs); for (j = 0; j < vers->nrprocs; j++) seq_printf(seq, " %u", vers->counts[j]); seq_putc(seq, '\n'); } return 0; } static int rpc_proc_open(struct inode *inode, struct file *file) { return single_open(file, rpc_proc_show, pde_data(inode)); } static const struct proc_ops rpc_proc_ops = { .proc_open = rpc_proc_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, }; /* * Get RPC server stats */ void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) { const struct svc_program *prog = statp->program; const struct svc_version *vers; unsigned int i, j, k; unsigned long count; seq_printf(seq, "net %u %u %u %u\n", statp->netcnt, statp->netudpcnt, statp->nettcpcnt, statp->nettcpconn); seq_printf(seq, "rpc %u %u %u %u %u\n", statp->rpccnt, statp->rpcbadfmt+statp->rpcbadauth+statp->rpcbadclnt, statp->rpcbadfmt, statp->rpcbadauth, statp->rpcbadclnt); for (i = 0; i < prog->pg_nvers; i++) { vers = prog->pg_vers[i]; if (!vers) continue; seq_printf(seq, "proc%d %u", i, vers->vs_nproc); for (j = 0; j < vers->vs_nproc; j++) { count = 0; for_each_possible_cpu(k) count += per_cpu(vers->vs_count[j], k); seq_printf(seq, " %lu", count); } seq_putc(seq, '\n'); } } EXPORT_SYMBOL_GPL(svc_seq_show); /** * rpc_alloc_iostats - allocate an rpc_iostats structure * @clnt: RPC program, version, and xprt * */ struct rpc_iostats *rpc_alloc_iostats(struct rpc_clnt *clnt) { struct rpc_iostats *stats; int i; stats = kcalloc(clnt->cl_maxproc, sizeof(*stats), GFP_KERNEL); if (stats) { for (i = 0; i < clnt->cl_maxproc; i++) spin_lock_init(&stats[i].om_lock); } return stats; } EXPORT_SYMBOL_GPL(rpc_alloc_iostats); /** * rpc_free_iostats - release an rpc_iostats structure * @stats: doomed rpc_iostats structure * */ void rpc_free_iostats(struct rpc_iostats *stats) { kfree(stats); } EXPORT_SYMBOL_GPL(rpc_free_iostats); /** * rpc_count_iostats_metrics - tally up per-task stats * @task: completed rpc_task * @op_metrics: stat structure for OP that will accumulate stats from @task */ void rpc_count_iostats_metrics(const struct rpc_task *task, struct rpc_iostats *op_metrics) { struct rpc_rqst *req = task->tk_rqstp; ktime_t backlog, execute, now; if (!op_metrics || !req) return; now = ktime_get(); spin_lock(&op_metrics->om_lock); op_metrics->om_ops++; /* kernel API: om_ops must never become larger than om_ntrans */ op_metrics->om_ntrans += max(req->rq_ntrans, 1); op_metrics->om_timeouts += task->tk_timeouts; op_metrics->om_bytes_sent += req->rq_xmit_bytes_sent; op_metrics->om_bytes_recv += req->rq_reply_bytes_recvd; backlog = 0; if (ktime_to_ns(req->rq_xtime)) { backlog = ktime_sub(req->rq_xtime, task->tk_start); op_metrics->om_queue = ktime_add(op_metrics->om_queue, backlog); } op_metrics->om_rtt = ktime_add(op_metrics->om_rtt, req->rq_rtt); execute = ktime_sub(now, task->tk_start); op_metrics->om_execute = ktime_add(op_metrics->om_execute, execute); if (task->tk_status < 0) op_metrics->om_error_status++; spin_unlock(&op_metrics->om_lock); trace_rpc_stats_latency(req->rq_task, backlog, req->rq_rtt, execute); } EXPORT_SYMBOL_GPL(rpc_count_iostats_metrics); /** * rpc_count_iostats - tally up per-task stats * @task: completed rpc_task * @stats: array of stat structures * * Uses the statidx from @task */ void rpc_count_iostats(const struct rpc_task *task, struct rpc_iostats *stats) { rpc_count_iostats_metrics(task, &stats[task->tk_msg.rpc_proc->p_statidx]); } EXPORT_SYMBOL_GPL(rpc_count_iostats); static void _print_name(struct seq_file *seq, unsigned int op, const struct rpc_procinfo *procs) { if (procs[op].p_name) seq_printf(seq, "\t%12s: ", procs[op].p_name); else if (op == 0) seq_printf(seq, "\t NULL: "); else seq_printf(seq, "\t%12u: ", op); } static void _add_rpc_iostats(struct rpc_iostats *a, struct rpc_iostats *b) { a->om_ops += b->om_ops; a->om_ntrans += b->om_ntrans; a->om_timeouts += b->om_timeouts; a->om_bytes_sent += b->om_bytes_sent; a->om_bytes_recv += b->om_bytes_recv; a->om_queue = ktime_add(a->om_queue, b->om_queue); a->om_rtt = ktime_add(a->om_rtt, b->om_rtt); a->om_execute = ktime_add(a->om_execute, b->om_execute); a->om_error_status += b->om_error_status; } static void _print_rpc_iostats(struct seq_file *seq, struct rpc_iostats *stats, int op, const struct rpc_procinfo *procs) { _print_name(seq, op, procs); seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %llu %lu\n", stats->om_ops, stats->om_ntrans, stats->om_timeouts, stats->om_bytes_sent, stats->om_bytes_recv, ktime_to_ms(stats->om_queue), ktime_to_ms(stats->om_rtt), ktime_to_ms(stats->om_execute), stats->om_error_status); } static int do_print_stats(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *seqv) { struct seq_file *seq = seqv; xprt->ops->print_stats(xprt, seq); return 0; } void rpc_clnt_show_stats(struct seq_file *seq, struct rpc_clnt *clnt) { unsigned int op, maxproc = clnt->cl_maxproc; if (!clnt->cl_metrics) return; seq_printf(seq, "\tRPC iostats version: %s ", RPC_IOSTATS_VERS); seq_printf(seq, "p/v: %u/%u (%s)\n", clnt->cl_prog, clnt->cl_vers, clnt->cl_program->name); rpc_clnt_iterate_for_each_xprt(clnt, do_print_stats, seq); seq_printf(seq, "\tper-op statistics\n"); for (op = 0; op < maxproc; op++) { struct rpc_iostats stats = {}; struct rpc_clnt *next = clnt; do { _add_rpc_iostats(&stats, &next->cl_metrics[op]); if (next == next->cl_parent) break; next = next->cl_parent; } while (next); _print_rpc_iostats(seq, &stats, op, clnt->cl_procinfo); } } EXPORT_SYMBOL_GPL(rpc_clnt_show_stats); /* * Register/unregister RPC proc files */ static inline struct proc_dir_entry * do_register(struct net *net, const char *name, void *data, const struct proc_ops *proc_ops) { struct sunrpc_net *sn; dprintk("RPC: registering /proc/net/rpc/%s\n", name); sn = net_generic(net, sunrpc_net_id); return proc_create_data(name, 0, sn->proc_net_rpc, proc_ops, data); } struct proc_dir_entry * rpc_proc_register(struct net *net, struct rpc_stat *statp) { return do_register(net, statp->program->name, statp, &rpc_proc_ops); } EXPORT_SYMBOL_GPL(rpc_proc_register); void rpc_proc_unregister(struct net *net, const char *name) { struct sunrpc_net *sn; sn = net_generic(net, sunrpc_net_id); remove_proc_entry(name, sn->proc_net_rpc); } EXPORT_SYMBOL_GPL(rpc_proc_unregister); struct proc_dir_entry * svc_proc_register(struct net *net, struct svc_stat *statp, const struct proc_ops *proc_ops) { return do_register(net, statp->program->pg_name, net, proc_ops); } EXPORT_SYMBOL_GPL(svc_proc_register); void svc_proc_unregister(struct net *net, const char *name) { struct sunrpc_net *sn; sn = net_generic(net, sunrpc_net_id); remove_proc_entry(name, sn->proc_net_rpc); } EXPORT_SYMBOL_GPL(svc_proc_unregister); int rpc_proc_init(struct net *net) { struct sunrpc_net *sn; dprintk("RPC: registering /proc/net/rpc\n"); sn = net_generic(net, sunrpc_net_id); sn->proc_net_rpc = proc_mkdir("rpc", net->proc_net); if (sn->proc_net_rpc == NULL) return -ENOMEM; return 0; } void rpc_proc_exit(struct net *net) { dprintk("RPC: unregistering /proc/net/rpc\n"); remove_proc_entry("rpc", net->proc_net); }
1 1 1 1 1 235 208 30 1 31 6 6 6 6 6 6 25 44 44 44 5 33 33 16 17 33 33 6 28 4 2 2 2 1 18 18 6 7 30 30 21 25 41 1 34 34 34 34 17 17 17 23 7 3 2 30 30 9 21 21 17 26 17 11 10 11 3 11 2 3 25 9 6 6 6 6 66 33 33 6 23 7 3 2 9 9 6 37 21 59 38 38 6 32 3 16 16 16 3 13 6 2 4 35 31 4 7 7 7 1 1 6 7 5 3 5 1 62 56 6 6 37 14 44 2 5 39 10 2 14 2 4 30 34 5 2 2 38 6 3 18 3 13 3 1 2 2 65 22 47 65 65 65 64 256 257 157 158 106 153 15 7 258 8 2 1 2 3 5 5 1 5 311 311 311 213 112 311 1 8 8 7 311 189 13 84 44 311 65 123 184 77 10 255 255 255 255 255 255 255 123 176 11 83 64 1 4 147 7 141 2 6 1 1 209 83 34 97 270 17 77 209 130 154 273 271 2 269 22 52 54 136 5 1 75 194 260 257 258 258 118 25 116 213 20 9 216 145 300 300 34 34 34 34 34 34 34 34 34 21 13 34 65 65 40 19 59 59 45 1 2 2 41 32 9 8 31 3 4 32 36 4 32 26 2 8 10 10 4 4 36 3 3 34 150 1 39 1 5 1 3 2 1 1 1 1 1 1 2 3 1 5 1 2 2 1 1 2 2 14 1 12 12 1 1 65 1 59 1 1 1 2 1 2 1 5 1 1 4 1265 1264 1263 61 12 25 20 1262 1262 54 49 1 2 2 13 13 2 2 11 4 5 9 9 9 4 5 9 33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * PACKET - implements raw packet sockets. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox, <gw4pts@gw4pts.ampr.org> * * Fixes: * Alan Cox : verify_area() now used correctly * Alan Cox : new skbuff lists, look ma no backlogs! * Alan Cox : tidied skbuff lists. * Alan Cox : Now uses generic datagram routines I * added. Also fixed the peek/read crash * from all old Linux datagram code. * Alan Cox : Uses the improved datagram code. * Alan Cox : Added NULL's for socket options. * Alan Cox : Re-commented the code. * Alan Cox : Use new kernel side addressing * Rob Janssen : Correct MTU usage. * Dave Platt : Counter leaks caused by incorrect * interrupt locking and some slightly * dubious gcc output. Can you read * compiler: it said _VOLATILE_ * Richard Kooijman : Timestamp fixes. * Alan Cox : New buffers. Use sk->mac.raw. * Alan Cox : sendmsg/recvmsg support. * Alan Cox : Protocol setting support * Alexey Kuznetsov : Untied from IPv4 stack. * Cyrus Durgin : Fixed kerneld for kmod. * Michal Ostrowski : Module initialization cleanup. * Ulises Alonso : Frame number limit removal and * packet_set_ring memory leak. * Eric Biederman : Allow for > 8 byte hardware addresses. * The convention is that longer addresses * will simply extend the hardware address * byte arrays at the end of sockaddr_ll * and packet_mreq. * Johann Baudy : Added TX RING. * Chetan Loke : Implemented TPACKET_V3 block abstraction * layer. * Copyright (C) 2011, <lokec@ccs.neu.edu> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/ethtool.h> #include <linux/filter.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/capability.h> #include <linux/fcntl.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_packet.h> #include <linux/wireless.h> #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/errno.h> #include <linux/timer.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <asm/page.h> #include <asm/cacheflush.h> #include <asm/io.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/poll.h> #include <linux/module.h> #include <linux/init.h> #include <linux/mutex.h> #include <linux/if_vlan.h> #include <linux/virtio_net.h> #include <linux/errqueue.h> #include <linux/net_tstamp.h> #include <linux/percpu.h> #ifdef CONFIG_INET #include <net/inet_common.h> #endif #include <linux/bpf.h> #include <net/compat.h> #include <linux/netfilter_netdev.h> #include "internal.h" /* Assumptions: - If the device has no dev->header_ops->create, there is no LL header visible above the device. In this case, its hard_header_len should be 0. The device may prepend its own header internally. In this case, its needed_headroom should be set to the space needed for it to add its internal header. For example, a WiFi driver pretending to be an Ethernet driver should set its hard_header_len to be the Ethernet header length, and set its needed_headroom to be (the real WiFi header length - the fake Ethernet header length). - packet socket receives packets with pulled ll header, so that SOCK_RAW should push it back. On receive: ----------- Incoming, dev_has_header(dev) == true mac_header -> ll header data -> data Outgoing, dev_has_header(dev) == true mac_header -> ll header data -> ll header Incoming, dev_has_header(dev) == false mac_header -> data However drivers often make it point to the ll header. This is incorrect because the ll header should be invisible to us. data -> data Outgoing, dev_has_header(dev) == false mac_header -> data. ll header is invisible to us. data -> data Resume If dev_has_header(dev) == false we are unable to restore the ll header, because it is invisible to us. On transmit: ------------ dev_has_header(dev) == true mac_header -> ll header data -> ll header dev_has_header(dev) == false (ll header is invisible to us) mac_header -> data data -> data We should set network_header on output to the correct position, packet classifier depends on it. */ /* Private packet socket structures. */ /* identical to struct packet_mreq except it has * a longer address field. */ struct packet_mreq_max { int mr_ifindex; unsigned short mr_type; unsigned short mr_alen; unsigned char mr_address[MAX_ADDR_LEN]; }; union tpacket_uhdr { struct tpacket_hdr *h1; struct tpacket2_hdr *h2; struct tpacket3_hdr *h3; void *raw; }; static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, int closing, int tx_ring); #define V3_ALIGNMENT (8) #define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT)) #define BLK_PLUS_PRIV(sz_of_priv) \ (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT)) #define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status) #define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts) #define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt) #define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len) #define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num) #define BLOCK_O2PRIV(x) ((x)->offset_to_priv) struct packet_sock; static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); static void *packet_previous_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status); static void packet_increment_head(struct packet_ring_buffer *buff); static int prb_curr_blk_in_use(struct tpacket_block_desc *); static void *prb_dispatch_next_block(struct tpacket_kbdq_core *, struct packet_sock *); static void prb_retire_current_block(struct tpacket_kbdq_core *, struct packet_sock *, unsigned int status); static int prb_queue_frozen(struct tpacket_kbdq_core *); static void prb_open_block(struct tpacket_kbdq_core *, struct tpacket_block_desc *); static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *); static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void prb_clear_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void prb_fill_vlan_info(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void packet_flush_mclist(struct sock *sk); static u16 packet_pick_tx_queue(struct sk_buff *skb); struct packet_skb_cb { union { struct sockaddr_pkt pkt; union { /* Trick: alias skb original length with * ll.sll_family and ll.protocol in order * to save room. */ unsigned int origlen; struct sockaddr_ll ll; }; } sa; }; #define vio_le() virtio_legacy_is_little_endian() #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb)) #define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc)) #define GET_PBLOCK_DESC(x, bid) \ ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer)) #define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \ ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer)) #define GET_NEXT_PRB_BLK_NUM(x) \ (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \ ((x)->kactive_blk_num+1) : 0) static void __fanout_unlink(struct sock *sk, struct packet_sock *po); static void __fanout_link(struct sock *sk, struct packet_sock *po); #ifdef CONFIG_NETFILTER_EGRESS static noinline struct sk_buff *nf_hook_direct_egress(struct sk_buff *skb) { struct sk_buff *next, *head = NULL, *tail; int rc; rcu_read_lock(); for (; skb != NULL; skb = next) { next = skb->next; skb_mark_not_on_list(skb); if (!nf_hook_egress(skb, &rc, skb->dev)) continue; if (!head) head = skb; else tail->next = skb; tail = skb; } rcu_read_unlock(); return head; } #endif static int packet_xmit(const struct packet_sock *po, struct sk_buff *skb) { if (!packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS)) return dev_queue_xmit(skb); #ifdef CONFIG_NETFILTER_EGRESS if (nf_hook_egress_active()) { skb = nf_hook_direct_egress(skb); if (!skb) return NET_XMIT_DROP; } #endif return dev_direct_xmit(skb, packet_pick_tx_queue(skb)); } static struct net_device *packet_cached_dev_get(struct packet_sock *po) { struct net_device *dev; rcu_read_lock(); dev = rcu_dereference(po->cached_dev); dev_hold(dev); rcu_read_unlock(); return dev; } static void packet_cached_dev_assign(struct packet_sock *po, struct net_device *dev) { rcu_assign_pointer(po->cached_dev, dev); } static void packet_cached_dev_reset(struct packet_sock *po) { RCU_INIT_POINTER(po->cached_dev, NULL); } static u16 packet_pick_tx_queue(struct sk_buff *skb) { struct net_device *dev = skb->dev; const struct net_device_ops *ops = dev->netdev_ops; int cpu = raw_smp_processor_id(); u16 queue_index; #ifdef CONFIG_XPS skb->sender_cpu = cpu + 1; #endif skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues); if (ops->ndo_select_queue) { queue_index = ops->ndo_select_queue(dev, skb, NULL); queue_index = netdev_cap_txqueue(dev, queue_index); } else { queue_index = netdev_pick_tx(dev, skb, NULL); } return queue_index; } /* __register_prot_hook must be invoked through register_prot_hook * or from a context in which asynchronous accesses to the packet * socket is not possible (packet_create()). */ static void __register_prot_hook(struct sock *sk) { struct packet_sock *po = pkt_sk(sk); if (!packet_sock_flag(po, PACKET_SOCK_RUNNING)) { if (po->fanout) __fanout_link(sk, po); else dev_add_pack(&po->prot_hook); sock_hold(sk); packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 1); } } static void register_prot_hook(struct sock *sk) { lockdep_assert_held_once(&pkt_sk(sk)->bind_lock); __register_prot_hook(sk); } /* If the sync parameter is true, we will temporarily drop * the po->bind_lock and do a synchronize_net to make sure no * asynchronous packet processing paths still refer to the elements * of po->prot_hook. If the sync parameter is false, it is the * callers responsibility to take care of this. */ static void __unregister_prot_hook(struct sock *sk, bool sync) { struct packet_sock *po = pkt_sk(sk); lockdep_assert_held_once(&po->bind_lock); packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 0); if (po->fanout) __fanout_unlink(sk, po); else __dev_remove_pack(&po->prot_hook); __sock_put(sk); if (sync) { spin_unlock(&po->bind_lock); synchronize_net(); spin_lock(&po->bind_lock); } } static void unregister_prot_hook(struct sock *sk, bool sync) { struct packet_sock *po = pkt_sk(sk); if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) __unregister_prot_hook(sk, sync); } static inline struct page * __pure pgv_to_page(void *addr) { if (is_vmalloc_addr(addr)) return vmalloc_to_page(addr); return virt_to_page(addr); } static void __packet_set_status(struct packet_sock *po, void *frame, int status) { union tpacket_uhdr h; /* WRITE_ONCE() are paired with READ_ONCE() in __packet_get_status */ h.raw = frame; switch (po->tp_version) { case TPACKET_V1: WRITE_ONCE(h.h1->tp_status, status); flush_dcache_page(pgv_to_page(&h.h1->tp_status)); break; case TPACKET_V2: WRITE_ONCE(h.h2->tp_status, status); flush_dcache_page(pgv_to_page(&h.h2->tp_status)); break; case TPACKET_V3: WRITE_ONCE(h.h3->tp_status, status); flush_dcache_page(pgv_to_page(&h.h3->tp_status)); break; default: WARN(1, "TPACKET version not supported.\n"); BUG(); } smp_wmb(); } static int __packet_get_status(const struct packet_sock *po, void *frame) { union tpacket_uhdr h; smp_rmb(); /* READ_ONCE() are paired with WRITE_ONCE() in __packet_set_status */ h.raw = frame; switch (po->tp_version) { case TPACKET_V1: flush_dcache_page(pgv_to_page(&h.h1->tp_status)); return READ_ONCE(h.h1->tp_status); case TPACKET_V2: flush_dcache_page(pgv_to_page(&h.h2->tp_status)); return READ_ONCE(h.h2->tp_status); case TPACKET_V3: flush_dcache_page(pgv_to_page(&h.h3->tp_status)); return READ_ONCE(h.h3->tp_status); default: WARN(1, "TPACKET version not supported.\n"); BUG(); return 0; } } static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts, unsigned int flags) { struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); if (shhwtstamps && (flags & SOF_TIMESTAMPING_RAW_HARDWARE) && ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts)) return TP_STATUS_TS_RAW_HARDWARE; if ((flags & SOF_TIMESTAMPING_SOFTWARE) && ktime_to_timespec64_cond(skb_tstamp(skb), ts)) return TP_STATUS_TS_SOFTWARE; return 0; } static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame, struct sk_buff *skb) { union tpacket_uhdr h; struct timespec64 ts; __u32 ts_status; if (!(ts_status = tpacket_get_timestamp(skb, &ts, READ_ONCE(po->tp_tstamp)))) return 0; h.raw = frame; /* * versions 1 through 3 overflow the timestamps in y2106, since they * all store the seconds in a 32-bit unsigned integer. * If we create a version 4, that should have a 64-bit timestamp, * either 64-bit seconds + 32-bit nanoseconds, or just 64-bit * nanoseconds. */ switch (po->tp_version) { case TPACKET_V1: h.h1->tp_sec = ts.tv_sec; h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; break; case TPACKET_V2: h.h2->tp_sec = ts.tv_sec; h.h2->tp_nsec = ts.tv_nsec; break; case TPACKET_V3: h.h3->tp_sec = ts.tv_sec; h.h3->tp_nsec = ts.tv_nsec; break; default: WARN(1, "TPACKET version not supported.\n"); BUG(); } /* one flush is safe, as both fields always lie on the same cacheline */ flush_dcache_page(pgv_to_page(&h.h1->tp_sec)); smp_wmb(); return ts_status; } static void *packet_lookup_frame(const struct packet_sock *po, const struct packet_ring_buffer *rb, unsigned int position, int status) { unsigned int pg_vec_pos, frame_offset; union tpacket_uhdr h; pg_vec_pos = position / rb->frames_per_block; frame_offset = position % rb->frames_per_block; h.raw = rb->pg_vec[pg_vec_pos].buffer + (frame_offset * rb->frame_size); if (status != __packet_get_status(po, h.raw)) return NULL; return h.raw; } static void *packet_current_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { return packet_lookup_frame(po, rb, rb->head, status); } static u16 vlan_get_tci(const struct sk_buff *skb, struct net_device *dev) { struct vlan_hdr vhdr, *vh; unsigned int header_len; if (!dev) return 0; /* In the SOCK_DGRAM scenario, skb data starts at the network * protocol, which is after the VLAN headers. The outer VLAN * header is at the hard_header_len offset in non-variable * length link layer headers. If it's a VLAN device, the * min_header_len should be used to exclude the VLAN header * size. */ if (dev->min_header_len == dev->hard_header_len) header_len = dev->hard_header_len; else if (is_vlan_dev(dev)) header_len = dev->min_header_len; else return 0; vh = skb_header_pointer(skb, skb_mac_offset(skb) + header_len, sizeof(vhdr), &vhdr); if (unlikely(!vh)) return 0; return ntohs(vh->h_vlan_TCI); } static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb) { __be16 proto = skb->protocol; if (unlikely(eth_type_vlan(proto))) proto = __vlan_get_protocol_offset(skb, proto, skb_mac_offset(skb), NULL); return proto; } static void prb_shutdown_retire_blk_timer(struct packet_sock *po, struct sk_buff_head *rb_queue) { struct tpacket_kbdq_core *pkc; pkc = GET_PBDQC_FROM_RB(&po->rx_ring); hrtimer_cancel(&pkc->retire_blk_timer); } static int prb_calc_retire_blk_tmo(struct packet_sock *po, int blk_size_in_bytes) { struct net_device *dev; unsigned int mbits, div; struct ethtool_link_ksettings ecmd; int err; rtnl_lock(); dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex); if (unlikely(!dev)) { rtnl_unlock(); return DEFAULT_PRB_RETIRE_TOV; } err = __ethtool_get_link_ksettings(dev, &ecmd); rtnl_unlock(); if (err) return DEFAULT_PRB_RETIRE_TOV; /* If the link speed is so slow you don't really * need to worry about perf anyways */ if (ecmd.base.speed < SPEED_1000 || ecmd.base.speed == SPEED_UNKNOWN) return DEFAULT_PRB_RETIRE_TOV; div = ecmd.base.speed / 1000; mbits = (blk_size_in_bytes * 8) / (1024 * 1024); if (div) mbits /= div; if (div) return mbits + 1; return mbits; } static void prb_init_ft_ops(struct tpacket_kbdq_core *p1, union tpacket_req_u *req_u) { p1->feature_req_word = req_u->req3.tp_feature_req_word; } static void init_prb_bdqc(struct packet_sock *po, struct packet_ring_buffer *rb, struct pgv *pg_vec, union tpacket_req_u *req_u) { struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb); struct tpacket_block_desc *pbd; memset(p1, 0x0, sizeof(*p1)); p1->knxt_seq_num = 1; p1->pkbdq = pg_vec; pbd = (struct tpacket_block_desc *)pg_vec[0].buffer; p1->pkblk_start = pg_vec[0].buffer; p1->kblk_size = req_u->req3.tp_block_size; p1->knum_blocks = req_u->req3.tp_block_nr; p1->hdrlen = po->tp_hdrlen; p1->version = po->tp_version; po->stats.stats3.tp_freeze_q_cnt = 0; if (req_u->req3.tp_retire_blk_tov) p1->interval_ktime = ms_to_ktime(req_u->req3.tp_retire_blk_tov); else p1->interval_ktime = ms_to_ktime(prb_calc_retire_blk_tmo(po, req_u->req3.tp_block_size)); p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv; rwlock_init(&p1->blk_fill_in_prog_lock); p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv); prb_init_ft_ops(p1, req_u); hrtimer_setup(&p1->retire_blk_timer, prb_retire_rx_blk_timer_expired, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); hrtimer_start(&p1->retire_blk_timer, p1->interval_ktime, HRTIMER_MODE_REL_SOFT); prb_open_block(p1, pbd); } /* * With a 1MB block-size, on a 1Gbps line, it will take * i) ~8 ms to fill a block + ii) memcpy etc. * In this cut we are not accounting for the memcpy time. * * Since the tmo granularity is in msecs, it is not too expensive * to refresh the timer, lets say every '8' msecs. * Either the user can set the 'tmo' or we can derive it based on * a) line-speed and b) block-size. * prb_calc_retire_blk_tmo() calculates the tmo. */ static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *t) { struct packet_sock *po = timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer); struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring); unsigned int frozen; struct tpacket_block_desc *pbd; spin_lock(&po->sk.sk_receive_queue.lock); frozen = prb_queue_frozen(pkc); pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); /* We only need to plug the race when the block is partially filled. * tpacket_rcv: * lock(); increment BLOCK_NUM_PKTS; unlock() * copy_bits() is in progress ... * timer fires on other cpu: * we can't retire the current block because copy_bits * is in progress. * */ if (BLOCK_NUM_PKTS(pbd)) { /* Waiting for skb_copy_bits to finish... */ write_lock(&pkc->blk_fill_in_prog_lock); write_unlock(&pkc->blk_fill_in_prog_lock); } if (!frozen) { if (BLOCK_NUM_PKTS(pbd)) { /* Not an empty block. Need retire the block. */ prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO); prb_dispatch_next_block(pkc, po); } } else { /* Case 1. Queue was frozen because user-space was * lagging behind. */ if (!prb_curr_blk_in_use(pbd)) { /* Case 2. queue was frozen,user-space caught up, * now the link went idle && the timer fired. * We don't have a block to close.So we open this * block and restart the timer. * opening a block thaws the queue,restarts timer * Thawing/timer-refresh is a side effect. */ prb_open_block(pkc, pbd); } } hrtimer_forward_now(&pkc->retire_blk_timer, pkc->interval_ktime); spin_unlock(&po->sk.sk_receive_queue.lock); return HRTIMER_RESTART; } static void prb_flush_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1, __u32 status) { /* Flush everything minus the block header */ #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 u8 *start, *end; start = (u8 *)pbd1; /* Skip the block header(we know header WILL fit in 4K) */ start += PAGE_SIZE; end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end); for (; start < end; start += PAGE_SIZE) flush_dcache_page(pgv_to_page(start)); smp_wmb(); #endif /* Now update the block status. */ BLOCK_STATUS(pbd1) = status; /* Flush the block header */ #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 start = (u8 *)pbd1; flush_dcache_page(pgv_to_page(start)); smp_wmb(); #endif } /* * Side effect: * * 1) flush the block * 2) Increment active_blk_num * * Note:We DONT refresh the timer on purpose. * Because almost always the next block will be opened. */ static void prb_close_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1, struct packet_sock *po, unsigned int stat) { __u32 status = TP_STATUS_USER | stat; struct tpacket3_hdr *last_pkt; struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; struct sock *sk = &po->sk; if (atomic_read(&po->tp_drops)) status |= TP_STATUS_LOSING; last_pkt = (struct tpacket3_hdr *)pkc1->prev; last_pkt->tp_next_offset = 0; /* Get the ts of the last pkt */ if (BLOCK_NUM_PKTS(pbd1)) { h1->ts_last_pkt.ts_sec = last_pkt->tp_sec; h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec; } else { /* Ok, we tmo'd - so get the current time. * * It shouldn't really happen as we don't close empty * blocks. See prb_retire_rx_blk_timer_expired(). */ struct timespec64 ts; ktime_get_real_ts64(&ts); h1->ts_last_pkt.ts_sec = ts.tv_sec; h1->ts_last_pkt.ts_nsec = ts.tv_nsec; } smp_wmb(); /* Flush the block */ prb_flush_block(pkc1, pbd1, status); sk->sk_data_ready(sk); pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1); } static void prb_thaw_queue(struct tpacket_kbdq_core *pkc) { pkc->reset_pending_on_curr_blk = 0; } /* * prb_open_block is called by tpacket_rcv or timer callback. * * Reasons why NOT update hrtimer in prb_open_block: * 1) It will increase complexity to distinguish the two caller scenario. * 2) hrtimer_cancel and hrtimer_start need to be called if you want to update * TMO of an already enqueued hrtimer, leading to complex shutdown logic. * * One side effect of NOT update hrtimer when called by tpacket_rcv is that * a newly opened block triggered by tpacket_rcv may be retired earlier than * expected. On the other hand, if timeout is updated in prb_open_block, the * frequent reception of network packets that leads to prb_open_block being * called may cause hrtimer to be removed and enqueued repeatedly. */ static void prb_open_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1) { struct timespec64 ts; struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; smp_rmb(); /* We could have just memset this but we will lose the * flexibility of making the priv area sticky */ BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++; BLOCK_NUM_PKTS(pbd1) = 0; BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); ktime_get_real_ts64(&ts); h1->ts_first_pkt.ts_sec = ts.tv_sec; h1->ts_first_pkt.ts_nsec = ts.tv_nsec; pkc1->pkblk_start = (char *)pbd1; pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN; pbd1->version = pkc1->version; pkc1->prev = pkc1->nxt_offset; pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size; prb_thaw_queue(pkc1); smp_wmb(); } /* * Queue freeze logic: * 1) Assume tp_block_nr = 8 blocks. * 2) At time 't0', user opens Rx ring. * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7 * 4) user-space is either sleeping or processing block '0'. * 5) tpacket_rcv is currently filling block '7', since there is no space left, * it will close block-7,loop around and try to fill block '0'. * call-flow: * __packet_lookup_frame_in_block * prb_retire_current_block() * prb_dispatch_next_block() * |->(BLOCK_STATUS == USER) evaluates to true * 5.1) Since block-0 is currently in-use, we just freeze the queue. * 6) Now there are two cases: * 6.1) Link goes idle right after the queue is frozen. * But remember, the last open_block() refreshed the timer. * When this timer expires,it will refresh itself so that we can * re-open block-0 in near future. * 6.2) Link is busy and keeps on receiving packets. This is a simple * case and __packet_lookup_frame_in_block will check if block-0 * is free and can now be re-used. */ static void prb_freeze_queue(struct tpacket_kbdq_core *pkc, struct packet_sock *po) { pkc->reset_pending_on_curr_blk = 1; po->stats.stats3.tp_freeze_q_cnt++; } #define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT)) /* * If the next block is free then we will dispatch it * and return a good offset. * Else, we will freeze the queue. * So, caller must check the return value. */ static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc, struct packet_sock *po) { struct tpacket_block_desc *pbd; smp_rmb(); /* 1. Get current block num */ pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); /* 2. If this block is currently in_use then freeze the queue */ if (TP_STATUS_USER & BLOCK_STATUS(pbd)) { prb_freeze_queue(pkc, po); return NULL; } /* * 3. * open this block and return the offset where the first packet * needs to get stored. */ prb_open_block(pkc, pbd); return (void *)pkc->nxt_offset; } static void prb_retire_current_block(struct tpacket_kbdq_core *pkc, struct packet_sock *po, unsigned int status) { struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); /* retire/close the current block */ if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) { /* * Plug the case where copy_bits() is in progress on * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't * have space to copy the pkt in the current block and * called prb_retire_current_block() * * We don't need to worry about the TMO case because * the timer-handler already handled this case. */ if (!(status & TP_STATUS_BLK_TMO)) { /* Waiting for skb_copy_bits to finish... */ write_lock(&pkc->blk_fill_in_prog_lock); write_unlock(&pkc->blk_fill_in_prog_lock); } prb_close_block(pkc, pbd, po, status); return; } } static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd) { return TP_STATUS_USER & BLOCK_STATUS(pbd); } static int prb_queue_frozen(struct tpacket_kbdq_core *pkc) { return pkc->reset_pending_on_curr_blk; } static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb) __releases(&pkc->blk_fill_in_prog_lock) { struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); read_unlock(&pkc->blk_fill_in_prog_lock); } static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb); } static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { ppd->hv1.tp_rxhash = 0; } static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { struct packet_sock *po = container_of(pkc, struct packet_sock, rx_ring.prb_bdqc); if (skb_vlan_tag_present(pkc->skb)) { ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb); ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto); ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else if (unlikely(po->sk.sk_type == SOCK_DGRAM && eth_type_vlan(pkc->skb->protocol))) { ppd->hv1.tp_vlan_tci = vlan_get_tci(pkc->skb, pkc->skb->dev); ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->protocol); ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { ppd->hv1.tp_vlan_tci = 0; ppd->hv1.tp_vlan_tpid = 0; ppd->tp_status = TP_STATUS_AVAILABLE; } } static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { ppd->hv1.tp_padding = 0; prb_fill_vlan_info(pkc, ppd); if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH) prb_fill_rxhash(pkc, ppd); else prb_clear_rxhash(pkc, ppd); } static void prb_fill_curr_block(char *curr, struct tpacket_kbdq_core *pkc, struct tpacket_block_desc *pbd, unsigned int len) __acquires(&pkc->blk_fill_in_prog_lock) { struct tpacket3_hdr *ppd; ppd = (struct tpacket3_hdr *)curr; ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len); pkc->prev = curr; pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len); BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len); BLOCK_NUM_PKTS(pbd) += 1; read_lock(&pkc->blk_fill_in_prog_lock); prb_run_all_ft_ops(pkc, ppd); } /* Assumes caller has the sk->rx_queue.lock */ static void *__packet_lookup_frame_in_block(struct packet_sock *po, struct sk_buff *skb, unsigned int len ) { struct tpacket_kbdq_core *pkc; struct tpacket_block_desc *pbd; char *curr, *end; pkc = GET_PBDQC_FROM_RB(&po->rx_ring); pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); /* Queue is frozen when user space is lagging behind */ if (prb_queue_frozen(pkc)) { /* * Check if that last block which caused the queue to freeze, * is still in_use by user-space. */ if (prb_curr_blk_in_use(pbd)) { /* Can't record this packet */ return NULL; } else { /* * Ok, the block was released by user-space. * Now let's open that block. * opening a block also thaws the queue. * Thawing is a side effect. */ prb_open_block(pkc, pbd); } } smp_mb(); curr = pkc->nxt_offset; pkc->skb = skb; end = (char *)pbd + pkc->kblk_size; /* first try the current block */ if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) { prb_fill_curr_block(curr, pkc, pbd, len); return (void *)curr; } /* Ok, close the current block */ prb_retire_current_block(pkc, po, 0); /* Now, try to dispatch the next block */ curr = (char *)prb_dispatch_next_block(pkc, po); if (curr) { pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); prb_fill_curr_block(curr, pkc, pbd, len); return (void *)curr; } /* * No free blocks are available.user_space hasn't caught up yet. * Queue was just frozen and now this packet will get dropped. */ return NULL; } static void *packet_current_rx_frame(struct packet_sock *po, struct sk_buff *skb, int status, unsigned int len) { char *curr = NULL; switch (po->tp_version) { case TPACKET_V1: case TPACKET_V2: curr = packet_lookup_frame(po, &po->rx_ring, po->rx_ring.head, status); return curr; case TPACKET_V3: return __packet_lookup_frame_in_block(po, skb, len); default: WARN(1, "TPACKET version not supported\n"); BUG(); return NULL; } } static void *prb_lookup_block(const struct packet_sock *po, const struct packet_ring_buffer *rb, unsigned int idx, int status) { struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx); if (status != BLOCK_STATUS(pbd)) return NULL; return pbd; } static int prb_previous_blk_num(struct packet_ring_buffer *rb) { unsigned int prev; if (rb->prb_bdqc.kactive_blk_num) prev = rb->prb_bdqc.kactive_blk_num-1; else prev = rb->prb_bdqc.knum_blocks-1; return prev; } /* Assumes caller has held the rx_queue.lock */ static void *__prb_previous_block(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { unsigned int previous = prb_previous_blk_num(rb); return prb_lookup_block(po, rb, previous, status); } static void *packet_previous_rx_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { if (po->tp_version <= TPACKET_V2) return packet_previous_frame(po, rb, status); return __prb_previous_block(po, rb, status); } static void packet_increment_rx_head(struct packet_sock *po, struct packet_ring_buffer *rb) { switch (po->tp_version) { case TPACKET_V1: case TPACKET_V2: return packet_increment_head(rb); case TPACKET_V3: default: WARN(1, "TPACKET version not supported.\n"); BUG(); return; } } static void *packet_previous_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max; return packet_lookup_frame(po, rb, previous, status); } static void packet_increment_head(struct packet_ring_buffer *buff) { buff->head = buff->head != buff->frame_max ? buff->head+1 : 0; } static void packet_inc_pending(struct packet_ring_buffer *rb) { this_cpu_inc(*rb->pending_refcnt); } static void packet_dec_pending(struct packet_ring_buffer *rb) { this_cpu_dec(*rb->pending_refcnt); } static unsigned int packet_read_pending(const struct packet_ring_buffer *rb) { unsigned int refcnt = 0; int cpu; /* We don't use pending refcount in rx_ring. */ if (rb->pending_refcnt == NULL) return 0; for_each_possible_cpu(cpu) refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu); return refcnt; } static int packet_alloc_pending(struct packet_sock *po) { po->rx_ring.pending_refcnt = NULL; po->tx_ring.pending_refcnt = alloc_percpu(unsigned int); if (unlikely(po->tx_ring.pending_refcnt == NULL)) return -ENOBUFS; return 0; } static void packet_free_pending(struct packet_sock *po) { free_percpu(po->tx_ring.pending_refcnt); } #define ROOM_POW_OFF 2 #define ROOM_NONE 0x0 #define ROOM_LOW 0x1 #define ROOM_NORMAL 0x2 static bool __tpacket_has_room(const struct packet_sock *po, int pow_off) { int idx, len; len = READ_ONCE(po->rx_ring.frame_max) + 1; idx = READ_ONCE(po->rx_ring.head); if (pow_off) idx += len >> pow_off; if (idx >= len) idx -= len; return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL); } static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off) { int idx, len; len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks); idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num); if (pow_off) idx += len >> pow_off; if (idx >= len) idx -= len; return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL); } static int __packet_rcv_has_room(const struct packet_sock *po, const struct sk_buff *skb) { const struct sock *sk = &po->sk; int ret = ROOM_NONE; if (po->prot_hook.func != tpacket_rcv) { int rcvbuf = READ_ONCE(sk->sk_rcvbuf); int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc) - (skb ? skb->truesize : 0); if (avail > (rcvbuf >> ROOM_POW_OFF)) return ROOM_NORMAL; else if (avail > 0) return ROOM_LOW; else return ROOM_NONE; } if (po->tp_version == TPACKET_V3) { if (__tpacket_v3_has_room(po, ROOM_POW_OFF)) ret = ROOM_NORMAL; else if (__tpacket_v3_has_room(po, 0)) ret = ROOM_LOW; } else { if (__tpacket_has_room(po, ROOM_POW_OFF)) ret = ROOM_NORMAL; else if (__tpacket_has_room(po, 0)) ret = ROOM_LOW; } return ret; } static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) { bool pressure; int ret; ret = __packet_rcv_has_room(po, skb); pressure = ret != ROOM_NORMAL; if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) != pressure) packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, pressure); return ret; } static void packet_rcv_try_clear_pressure(struct packet_sock *po) { if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL) packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, false); } static void packet_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_error_queue); WARN_ON(atomic_read(&sk->sk_rmem_alloc)); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); if (!sock_flag(sk, SOCK_DEAD)) { pr_err("Attempt to release alive packet socket: %p\n", sk); return; } } static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb) { u32 *history = po->rollover->history; u32 victim, rxhash; int i, count = 0; rxhash = skb_get_hash(skb); for (i = 0; i < ROLLOVER_HLEN; i++) if (READ_ONCE(history[i]) == rxhash) count++; victim = get_random_u32_below(ROLLOVER_HLEN); /* Avoid dirtying the cache line if possible */ if (READ_ONCE(history[victim]) != rxhash) WRITE_ONCE(history[victim], rxhash); return count > (ROLLOVER_HLEN >> 1); } static unsigned int fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return reciprocal_scale(__skb_get_hash_symmetric(skb), num); } static unsigned int fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { unsigned int val = atomic_inc_return(&f->rr_cur); return val % num; } static unsigned int fanout_demux_cpu(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return smp_processor_id() % num; } static unsigned int fanout_demux_rnd(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return get_random_u32_below(num); } static unsigned int fanout_demux_rollover(struct packet_fanout *f, struct sk_buff *skb, unsigned int idx, bool try_self, unsigned int num) { struct packet_sock *po, *po_next, *po_skip = NULL; unsigned int i, j, room = ROOM_NONE; po = pkt_sk(rcu_dereference(f->arr[idx])); if (try_self) { room = packet_rcv_has_room(po, skb); if (room == ROOM_NORMAL || (room == ROOM_LOW && !fanout_flow_is_huge(po, skb))) return idx; po_skip = po; } i = j = min_t(int, po->rollover->sock, num - 1); do { po_next = pkt_sk(rcu_dereference(f->arr[i])); if (po_next != po_skip && !packet_sock_flag(po_next, PACKET_SOCK_PRESSURE) && packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) { if (i != j) po->rollover->sock = i; atomic_long_inc(&po->rollover->num); if (room == ROOM_LOW) atomic_long_inc(&po->rollover->num_huge); return i; } if (++i == num) i = 0; } while (i != j); atomic_long_inc(&po->rollover->num_failed); return idx; } static unsigned int fanout_demux_qm(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return skb_get_queue_mapping(skb) % num; } static unsigned int fanout_demux_bpf(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { struct bpf_prog *prog; unsigned int ret = 0; rcu_read_lock(); prog = rcu_dereference(f->bpf_prog); if (prog) ret = bpf_prog_run_clear_cb(prog, skb) % num; rcu_read_unlock(); return ret; } static bool fanout_has_flag(struct packet_fanout *f, u16 flag) { return f->flags & (flag >> 8); } static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct packet_fanout *f = pt->af_packet_priv; unsigned int num = READ_ONCE(f->num_members); struct net *net = read_pnet(&f->net); struct packet_sock *po; unsigned int idx; if (!net_eq(dev_net(dev), net) || !num) { kfree_skb(skb); return 0; } if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) { skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET); if (!skb) return 0; } switch (f->type) { case PACKET_FANOUT_HASH: default: idx = fanout_demux_hash(f, skb, num); break; case PACKET_FANOUT_LB: idx = fanout_demux_lb(f, skb, num); break; case PACKET_FANOUT_CPU: idx = fanout_demux_cpu(f, skb, num); break; case PACKET_FANOUT_RND: idx = fanout_demux_rnd(f, skb, num); break; case PACKET_FANOUT_QM: idx = fanout_demux_qm(f, skb, num); break; case PACKET_FANOUT_ROLLOVER: idx = fanout_demux_rollover(f, skb, 0, false, num); break; case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: idx = fanout_demux_bpf(f, skb, num); break; } if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER)) idx = fanout_demux_rollover(f, skb, idx, true, num); po = pkt_sk(rcu_dereference(f->arr[idx])); return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev); } DEFINE_MUTEX(fanout_mutex); EXPORT_SYMBOL_GPL(fanout_mutex); static LIST_HEAD(fanout_list); static u16 fanout_next_id; static void __fanout_link(struct sock *sk, struct packet_sock *po) { struct packet_fanout *f = po->fanout; spin_lock(&f->lock); rcu_assign_pointer(f->arr[f->num_members], sk); smp_wmb(); f->num_members++; if (f->num_members == 1) dev_add_pack(&f->prot_hook); spin_unlock(&f->lock); } static void __fanout_unlink(struct sock *sk, struct packet_sock *po) { struct packet_fanout *f = po->fanout; int i; spin_lock(&f->lock); for (i = 0; i < f->num_members; i++) { if (rcu_dereference_protected(f->arr[i], lockdep_is_held(&f->lock)) == sk) break; } BUG_ON(i >= f->num_members); rcu_assign_pointer(f->arr[i], rcu_dereference_protected(f->arr[f->num_members - 1], lockdep_is_held(&f->lock))); f->num_members--; if (f->num_members == 0) __dev_remove_pack(&f->prot_hook); spin_unlock(&f->lock); } static bool match_fanout_group(struct packet_type *ptype, struct sock *sk) { if (sk->sk_family != PF_PACKET) return false; return ptype->af_packet_priv == pkt_sk(sk)->fanout; } static void fanout_init_data(struct packet_fanout *f) { switch (f->type) { case PACKET_FANOUT_LB: atomic_set(&f->rr_cur, 0); break; case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: RCU_INIT_POINTER(f->bpf_prog, NULL); break; } } static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new) { struct bpf_prog *old; spin_lock(&f->lock); old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock)); rcu_assign_pointer(f->bpf_prog, new); spin_unlock(&f->lock); if (old) { synchronize_net(); bpf_prog_destroy(old); } } static int fanout_set_data_cbpf(struct packet_sock *po, sockptr_t data, unsigned int len) { struct bpf_prog *new; struct sock_fprog fprog; int ret; if (sock_flag(&po->sk, SOCK_FILTER_LOCKED)) return -EPERM; ret = copy_bpf_fprog_from_user(&fprog, data, len); if (ret) return ret; ret = bpf_prog_create_from_user(&new, &fprog, NULL, false); if (ret) return ret; __fanout_set_data_bpf(po->fanout, new); return 0; } static int fanout_set_data_ebpf(struct packet_sock *po, sockptr_t data, unsigned int len) { struct bpf_prog *new; u32 fd; if (sock_flag(&po->sk, SOCK_FILTER_LOCKED)) return -EPERM; if (len != sizeof(fd)) return -EINVAL; if (copy_from_sockptr(&fd, data, len)) return -EFAULT; new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); if (IS_ERR(new)) return PTR_ERR(new); __fanout_set_data_bpf(po->fanout, new); return 0; } static int fanout_set_data(struct packet_sock *po, sockptr_t data, unsigned int len) { switch (po->fanout->type) { case PACKET_FANOUT_CBPF: return fanout_set_data_cbpf(po, data, len); case PACKET_FANOUT_EBPF: return fanout_set_data_ebpf(po, data, len); default: return -EINVAL; } } static void fanout_release_data(struct packet_fanout *f) { switch (f->type) { case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: __fanout_set_data_bpf(f, NULL); } } static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id) { struct packet_fanout *f; list_for_each_entry(f, &fanout_list, list) { if (f->id == candidate_id && read_pnet(&f->net) == sock_net(sk)) { return false; } } return true; } static bool fanout_find_new_id(struct sock *sk, u16 *new_id) { u16 id = fanout_next_id; do { if (__fanout_id_is_free(sk, id)) { *new_id = id; fanout_next_id = id + 1; return true; } id++; } while (id != fanout_next_id); return false; } static int fanout_add(struct sock *sk, struct fanout_args *args) { struct packet_rollover *rollover = NULL; struct packet_sock *po = pkt_sk(sk); u16 type_flags = args->type_flags; struct packet_fanout *f, *match; u8 type = type_flags & 0xff; u8 flags = type_flags >> 8; u16 id = args->id; int err; switch (type) { case PACKET_FANOUT_ROLLOVER: if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER) return -EINVAL; break; case PACKET_FANOUT_HASH: case PACKET_FANOUT_LB: case PACKET_FANOUT_CPU: case PACKET_FANOUT_RND: case PACKET_FANOUT_QM: case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: break; default: return -EINVAL; } mutex_lock(&fanout_mutex); err = -EALREADY; if (po->fanout) goto out; if (type == PACKET_FANOUT_ROLLOVER || (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) { err = -ENOMEM; rollover = kzalloc(sizeof(*rollover), GFP_KERNEL); if (!rollover) goto out; atomic_long_set(&rollover->num, 0); atomic_long_set(&rollover->num_huge, 0); atomic_long_set(&rollover->num_failed, 0); } if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) { if (id != 0) { err = -EINVAL; goto out; } if (!fanout_find_new_id(sk, &id)) { err = -ENOMEM; goto out; } /* ephemeral flag for the first socket in the group: drop it */ flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8); } match = NULL; list_for_each_entry(f, &fanout_list, list) { if (f->id == id && read_pnet(&f->net) == sock_net(sk)) { match = f; break; } } err = -EINVAL; if (match) { if (match->flags != flags) goto out; if (args->max_num_members && args->max_num_members != match->max_num_members) goto out; } else { if (args->max_num_members > PACKET_FANOUT_MAX) goto out; if (!args->max_num_members) /* legacy PACKET_FANOUT_MAX */ args->max_num_members = 256; err = -ENOMEM; match = kvzalloc(struct_size(match, arr, args->max_num_members), GFP_KERNEL); if (!match) goto out; write_pnet(&match->net, sock_net(sk)); match->id = id; match->type = type; match->flags = flags; INIT_LIST_HEAD(&match->list); spin_lock_init(&match->lock); refcount_set(&match->sk_ref, 0); fanout_init_data(match); match->prot_hook.type = po->prot_hook.type; match->prot_hook.dev = po->prot_hook.dev; match->prot_hook.func = packet_rcv_fanout; match->prot_hook.af_packet_priv = match; match->prot_hook.af_packet_net = read_pnet(&match->net); match->prot_hook.id_match = match_fanout_group; match->max_num_members = args->max_num_members; match->prot_hook.ignore_outgoing = type_flags & PACKET_FANOUT_FLAG_IGNORE_OUTGOING; list_add(&match->list, &fanout_list); } err = -EINVAL; spin_lock(&po->bind_lock); if (po->num && match->type == type && match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) { err = -ENOSPC; if (refcount_read(&match->sk_ref) < match->max_num_members) { /* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */ WRITE_ONCE(po->fanout, match); po->rollover = rollover; rollover = NULL; refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1); if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) { __dev_remove_pack(&po->prot_hook); __fanout_link(sk, po); } err = 0; } } spin_unlock(&po->bind_lock); if (err && !refcount_read(&match->sk_ref)) { list_del(&match->list); kvfree(match); } out: kfree(rollover); mutex_unlock(&fanout_mutex); return err; } /* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout. * It is the responsibility of the caller to call fanout_release_data() and * free the returned packet_fanout (after synchronize_net()) */ static struct packet_fanout *fanout_release(struct sock *sk) { struct packet_sock *po = pkt_sk(sk); struct packet_fanout *f; mutex_lock(&fanout_mutex); f = po->fanout; if (f) { po->fanout = NULL; if (refcount_dec_and_test(&f->sk_ref)) list_del(&f->list); else f = NULL; } mutex_unlock(&fanout_mutex); return f; } static bool packet_extra_vlan_len_allowed(const struct net_device *dev, struct sk_buff *skb) { /* Earlier code assumed this would be a VLAN pkt, double-check * this now that we have the actual packet in hand. We can only * do this check on Ethernet devices. */ if (unlikely(dev->type != ARPHRD_ETHER)) return false; skb_reset_mac_header(skb); return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q)); } static const struct proto_ops packet_ops; static const struct proto_ops packet_ops_spkt; static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct sock *sk; struct sockaddr_pkt *spkt; /* * When we registered the protocol we saved the socket in the data * field for just this event. */ sk = pt->af_packet_priv; /* * Yank back the headers [hope the device set this * right or kerboom...] * * Incoming packets have ll header pulled, * push it back. * * For outgoing ones skb->data == skb_mac_header(skb) * so that this procedure is noop. */ if (skb->pkt_type == PACKET_LOOPBACK) goto out; if (!net_eq(dev_net(dev), sock_net(sk))) goto out; skb = skb_share_check(skb, GFP_ATOMIC); if (skb == NULL) goto oom; /* drop any routing info */ skb_dst_drop(skb); /* drop conntrack reference */ nf_reset_ct(skb); spkt = &PACKET_SKB_CB(skb)->sa.pkt; skb_push(skb, skb->data - skb_mac_header(skb)); /* * The SOCK_PACKET socket receives _all_ frames. */ spkt->spkt_family = dev->type; strscpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device)); spkt->spkt_protocol = skb->protocol; /* * Charge the memory to the socket. This is done specifically * to prevent sockets using all the memory up. */ if (sock_queue_rcv_skb(sk, skb) == 0) return 0; out: kfree_skb(skb); oom: return 0; } static void packet_parse_headers(struct sk_buff *skb, struct socket *sock) { int depth; if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) && sock->type == SOCK_RAW) { skb_reset_mac_header(skb); skb->protocol = dev_parse_header_protocol(skb); } /* Move network header to the right position for VLAN tagged packets */ if (likely(skb->dev->type == ARPHRD_ETHER) && eth_type_vlan(skb->protocol) && vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0) skb_set_network_header(skb, depth); skb_probe_transport_header(skb); } /* * Output a raw packet to a device layer. This bypasses all the other * protocol layers and you must therefore supply it with a complete frame */ static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name); struct sk_buff *skb = NULL; struct net_device *dev; struct sockcm_cookie sockc; __be16 proto = 0; int err; int extra_len = 0; /* * Get and verify the address. */ if (saddr) { if (msg->msg_namelen < sizeof(struct sockaddr)) return -EINVAL; if (msg->msg_namelen == sizeof(struct sockaddr_pkt)) proto = saddr->spkt_protocol; } else return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */ /* * Find the device first to size check it */ saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0; retry: rcu_read_lock(); dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device); err = -ENODEV; if (dev == NULL) goto out_unlock; err = -ENETDOWN; if (!(dev->flags & IFF_UP)) goto out_unlock; /* * You may not queue a frame bigger than the mtu. This is the lowest level * raw protocol and you must do your own fragmentation at this level. */ if (unlikely(sock_flag(sk, SOCK_NOFCS))) { if (!netif_supports_nofcs(dev)) { err = -EPROTONOSUPPORT; goto out_unlock; } extra_len = 4; /* We're doing our own CRC */ } err = -EMSGSIZE; if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len) goto out_unlock; if (!skb) { size_t reserved = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0; rcu_read_unlock(); skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL); if (skb == NULL) return -ENOBUFS; /* FIXME: Save some space for broken drivers that write a hard * header at transmission time by themselves. PPP is the notable * one here. This should really be fixed at the driver level. */ skb_reserve(skb, reserved); skb_reset_network_header(skb); /* Try to align data part correctly */ if (hhlen) { skb->data -= hhlen; skb->tail -= hhlen; if (len < hhlen) skb_reset_network_header(skb); } err = memcpy_from_msg(skb_put(skb, len), msg, len); if (err) goto out_free; goto retry; } if (!dev_validate_header(dev, skb->data, len) || !skb->len) { err = -EINVAL; goto out_unlock; } if (len > (dev->mtu + dev->hard_header_len + extra_len) && !packet_extra_vlan_len_allowed(dev, skb)) { err = -EMSGSIZE; goto out_unlock; } sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) goto out_unlock; } skb->protocol = proto; skb->dev = dev; skb->priority = sockc.priority; skb->mark = sockc.mark; skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); skb_setup_tx_timestamp(skb, &sockc); if (unlikely(extra_len == 4)) skb->no_fcs = 1; packet_parse_headers(skb, sock); dev_queue_xmit(skb); rcu_read_unlock(); return len; out_unlock: rcu_read_unlock(); out_free: kfree_skb(skb); return err; } static unsigned int run_filter(struct sk_buff *skb, const struct sock *sk, unsigned int res) { struct sk_filter *filter; rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter != NULL) res = bpf_prog_run_clear_cb(filter->prog, skb); rcu_read_unlock(); return res; } static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb, size_t *len, int vnet_hdr_sz) { struct virtio_net_hdr_mrg_rxbuf vnet_hdr = { .num_buffers = 0 }; if (*len < vnet_hdr_sz) return -EINVAL; *len -= vnet_hdr_sz; if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)&vnet_hdr, vio_le(), true, 0)) return -EINVAL; return memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_sz); } /* * This function makes lazy skb cloning in hope that most of packets * are discarded by BPF. * * Note tricky part: we DO mangle shared skb! skb->data, skb->len * and skb->cb are mangled. It works because (and until) packets * falling here are owned by current CPU. Output packets are cloned * by dev_queue_xmit_nit(), input packets are processed by net_bh * sequentially, so that if we return skb to original state on exit, * we will not harm anyone. */ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { enum skb_drop_reason drop_reason = SKB_CONSUMED; struct sock *sk = NULL; struct sockaddr_ll *sll; struct packet_sock *po; u8 *skb_head = skb->data; int skb_len = skb->len; unsigned int snaplen, res; if (skb->pkt_type == PACKET_LOOPBACK) goto drop; sk = pt->af_packet_priv; po = pkt_sk(sk); if (!net_eq(dev_net(dev), sock_net(sk))) goto drop; skb->dev = dev; if (dev_has_header(dev)) { /* The device has an explicit notion of ll header, * exported to higher levels. * * Otherwise, the device hides details of its frame * structure, so that corresponding packet head is * never delivered to user. */ if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb_mac_header(skb)); else if (skb->pkt_type == PACKET_OUTGOING) { /* Special case: outgoing packets have ll header at head */ skb_pull(skb, skb_network_offset(skb)); } } snaplen = skb_frags_readable(skb) ? skb->len : skb_headlen(skb); res = run_filter(skb, sk, snaplen); if (!res) goto drop_n_restore; if (snaplen > res) snaplen = res; if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) goto drop_n_acct; if (skb_shared(skb)) { struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); if (nskb == NULL) goto drop_n_acct; if (skb_head != skb->data) { skb->data = skb_head; skb->len = skb_len; } consume_skb(skb); skb = nskb; } sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8); sll = &PACKET_SKB_CB(skb)->sa.ll; sll->sll_hatype = dev->type; sll->sll_pkttype = skb->pkt_type; if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV))) sll->sll_ifindex = orig_dev->ifindex; else sll->sll_ifindex = dev->ifindex; sll->sll_halen = dev_parse_header(skb, sll->sll_addr); /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg(). * Use their space for storing the original skb length. */ PACKET_SKB_CB(skb)->sa.origlen = skb->len; if (pskb_trim(skb, snaplen)) goto drop_n_acct; skb_set_owner_r(skb, sk); skb->dev = NULL; skb_dst_drop(skb); /* drop conntrack reference */ nf_reset_ct(skb); spin_lock(&sk->sk_receive_queue.lock); po->stats.stats1.tp_packets++; sock_skb_set_dropcount(sk, skb); skb_clear_delivery_time(skb); __skb_queue_tail(&sk->sk_receive_queue, skb); spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk); return 0; drop_n_acct: atomic_inc(&po->tp_drops); sk_drops_inc(sk); drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR; drop_n_restore: if (skb_head != skb->data && skb_shared(skb)) { skb->data = skb_head; skb->len = skb_len; } drop: sk_skb_reason_drop(sk, skb, drop_reason); return 0; } static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { enum skb_drop_reason drop_reason = SKB_CONSUMED; struct sock *sk = NULL; struct packet_sock *po; struct sockaddr_ll *sll; union tpacket_uhdr h; u8 *skb_head = skb->data; int skb_len = skb->len; unsigned int snaplen, res; unsigned long status = TP_STATUS_USER; unsigned short macoff, hdrlen; unsigned int netoff; struct sk_buff *copy_skb = NULL; struct timespec64 ts; __u32 ts_status; unsigned int slot_id = 0; int vnet_hdr_sz = 0; /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. * We may add members to them until current aligned size without forcing * userspace to call getsockopt(..., PACKET_HDRLEN, ...). */ BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32); BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48); if (skb->pkt_type == PACKET_LOOPBACK) goto drop; sk = pt->af_packet_priv; po = pkt_sk(sk); if (!net_eq(dev_net(dev), sock_net(sk))) goto drop; if (dev_has_header(dev)) { if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb_mac_header(skb)); else if (skb->pkt_type == PACKET_OUTGOING) { /* Special case: outgoing packets have ll header at head */ skb_pull(skb, skb_network_offset(skb)); } } snaplen = skb_frags_readable(skb) ? skb->len : skb_headlen(skb); res = run_filter(skb, sk, snaplen); if (!res) goto drop_n_restore; /* If we are flooded, just give up */ if (__packet_rcv_has_room(po, skb) == ROOM_NONE) { atomic_inc(&po->tp_drops); goto drop_n_restore; } if (skb->ip_summed == CHECKSUM_PARTIAL) status |= TP_STATUS_CSUMNOTREADY; else if (skb->pkt_type != PACKET_OUTGOING && skb_csum_unnecessary(skb)) status |= TP_STATUS_CSUM_VALID; if (skb_is_gso(skb) && skb_is_gso_tcp(skb)) status |= TP_STATUS_GSO_TCP; if (snaplen > res) snaplen = res; if (sk->sk_type == SOCK_DGRAM) { macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 + po->tp_reserve; } else { unsigned int maclen = skb_network_offset(skb); netoff = TPACKET_ALIGN(po->tp_hdrlen + (maclen < 16 ? 16 : maclen)) + po->tp_reserve; vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); if (vnet_hdr_sz) netoff += vnet_hdr_sz; macoff = netoff - maclen; } if (netoff > USHRT_MAX) { atomic_inc(&po->tp_drops); goto drop_n_restore; } if (po->tp_version <= TPACKET_V2) { if (macoff + snaplen > po->rx_ring.frame_size) { if (READ_ONCE(po->copy_thresh) && atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { if (skb_shared(skb)) { copy_skb = skb_clone(skb, GFP_ATOMIC); } else { copy_skb = skb_get(skb); skb_head = skb->data; } if (copy_skb) { memset(&PACKET_SKB_CB(copy_skb)->sa.ll, 0, sizeof(PACKET_SKB_CB(copy_skb)->sa.ll)); skb_set_owner_r(copy_skb, sk); } } snaplen = po->rx_ring.frame_size - macoff; if ((int)snaplen < 0) { snaplen = 0; vnet_hdr_sz = 0; } } } else if (unlikely(macoff + snaplen > GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) { u32 nval; nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff; pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n", snaplen, nval, macoff); snaplen = nval; if (unlikely((int)snaplen < 0)) { snaplen = 0; macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len; vnet_hdr_sz = 0; } } spin_lock(&sk->sk_receive_queue.lock); h.raw = packet_current_rx_frame(po, skb, TP_STATUS_KERNEL, (macoff+snaplen)); if (!h.raw) goto drop_n_account; if (po->tp_version <= TPACKET_V2) { slot_id = po->rx_ring.head; if (test_bit(slot_id, po->rx_ring.rx_owner_map)) goto drop_n_account; __set_bit(slot_id, po->rx_ring.rx_owner_map); } if (vnet_hdr_sz && virtio_net_hdr_from_skb(skb, h.raw + macoff - sizeof(struct virtio_net_hdr), vio_le(), true, 0)) { if (po->tp_version == TPACKET_V3) prb_clear_blk_fill_status(&po->rx_ring); goto drop_n_account; } if (po->tp_version <= TPACKET_V2) { packet_increment_rx_head(po, &po->rx_ring); /* * LOSING will be reported till you read the stats, * because it's COR - Clear On Read. * Anyways, moving it for V1/V2 only as V3 doesn't need this * at packet level. */ if (atomic_read(&po->tp_drops)) status |= TP_STATUS_LOSING; } po->stats.stats1.tp_packets++; if (copy_skb) { status |= TP_STATUS_COPY; skb_clear_delivery_time(copy_skb); __skb_queue_tail(&sk->sk_receive_queue, copy_skb); } spin_unlock(&sk->sk_receive_queue.lock); skb_copy_bits(skb, 0, h.raw + macoff, snaplen); /* Always timestamp; prefer an existing software timestamp taken * closer to the time of capture. */ ts_status = tpacket_get_timestamp(skb, &ts, READ_ONCE(po->tp_tstamp) | SOF_TIMESTAMPING_SOFTWARE); if (!ts_status) ktime_get_real_ts64(&ts); status |= ts_status; switch (po->tp_version) { case TPACKET_V1: h.h1->tp_len = skb->len; h.h1->tp_snaplen = snaplen; h.h1->tp_mac = macoff; h.h1->tp_net = netoff; h.h1->tp_sec = ts.tv_sec; h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; hdrlen = sizeof(*h.h1); break; case TPACKET_V2: h.h2->tp_len = skb->len; h.h2->tp_snaplen = snaplen; h.h2->tp_mac = macoff; h.h2->tp_net = netoff; h.h2->tp_sec = ts.tv_sec; h.h2->tp_nsec = ts.tv_nsec; if (skb_vlan_tag_present(skb)) { h.h2->tp_vlan_tci = skb_vlan_tag_get(skb); h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto); status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else if (unlikely(sk->sk_type == SOCK_DGRAM && eth_type_vlan(skb->protocol))) { h.h2->tp_vlan_tci = vlan_get_tci(skb, skb->dev); h.h2->tp_vlan_tpid = ntohs(skb->protocol); status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { h.h2->tp_vlan_tci = 0; h.h2->tp_vlan_tpid = 0; } memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding)); hdrlen = sizeof(*h.h2); break; case TPACKET_V3: /* tp_nxt_offset,vlan are already populated above. * So DONT clear those fields here */ h.h3->tp_status |= status; h.h3->tp_len = skb->len; h.h3->tp_snaplen = snaplen; h.h3->tp_mac = macoff; h.h3->tp_net = netoff; h.h3->tp_sec = ts.tv_sec; h.h3->tp_nsec = ts.tv_nsec; memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding)); hdrlen = sizeof(*h.h3); break; default: BUG(); } sll = h.raw + TPACKET_ALIGN(hdrlen); sll->sll_halen = dev_parse_header(skb, sll->sll_addr); sll->sll_family = AF_PACKET; sll->sll_hatype = dev->type; sll->sll_protocol = (sk->sk_type == SOCK_DGRAM) ? vlan_get_protocol_dgram(skb) : skb->protocol; sll->sll_pkttype = skb->pkt_type; if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV))) sll->sll_ifindex = orig_dev->ifindex; else sll->sll_ifindex = dev->ifindex; smp_mb(); #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 if (po->tp_version <= TPACKET_V2) { u8 *start, *end; end = (u8 *) PAGE_ALIGN((unsigned long) h.raw + macoff + snaplen); for (start = h.raw; start < end; start += PAGE_SIZE) flush_dcache_page(pgv_to_page(start)); } smp_wmb(); #endif if (po->tp_version <= TPACKET_V2) { spin_lock(&sk->sk_receive_queue.lock); __packet_set_status(po, h.raw, status); __clear_bit(slot_id, po->rx_ring.rx_owner_map); spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk); } else if (po->tp_version == TPACKET_V3) { prb_clear_blk_fill_status(&po->rx_ring); } drop_n_restore: if (skb_head != skb->data && skb_shared(skb)) { skb->data = skb_head; skb->len = skb_len; } drop: sk_skb_reason_drop(sk, skb, drop_reason); return 0; drop_n_account: spin_unlock(&sk->sk_receive_queue.lock); atomic_inc(&po->tp_drops); drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR; sk->sk_data_ready(sk); sk_skb_reason_drop(sk, copy_skb, drop_reason); goto drop_n_restore; } static void tpacket_destruct_skb(struct sk_buff *skb) { struct packet_sock *po = pkt_sk(skb->sk); if (likely(po->tx_ring.pg_vec)) { void *ph; __u32 ts; ph = skb_zcopy_get_nouarg(skb); packet_dec_pending(&po->tx_ring); ts = __packet_set_timestamp(po, ph, skb); __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts); complete(&po->skb_completion); } sock_wfree(skb); } static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len) { if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) + __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 > __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len))) vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(), __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) + __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2); if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len) return -EINVAL; return 0; } static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len, struct virtio_net_hdr *vnet_hdr, int vnet_hdr_sz) { int ret; if (*len < vnet_hdr_sz) return -EINVAL; *len -= vnet_hdr_sz; if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter)) return -EFAULT; ret = __packet_snd_vnet_parse(vnet_hdr, *len); if (ret) return ret; /* move iter to point to the start of mac header */ if (vnet_hdr_sz != sizeof(struct virtio_net_hdr)) iov_iter_advance(&msg->msg_iter, vnet_hdr_sz - sizeof(struct virtio_net_hdr)); return 0; } static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, void *frame, struct net_device *dev, void *data, int tp_len, __be16 proto, unsigned char *addr, int hlen, int copylen, const struct sockcm_cookie *sockc) { union tpacket_uhdr ph; int to_write, offset, len, nr_frags, len_max; struct socket *sock = po->sk.sk_socket; struct page *page; int err; ph.raw = frame; skb->protocol = proto; skb->dev = dev; skb->priority = sockc->priority; skb->mark = sockc->mark; skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, po->sk.sk_clockid); skb_setup_tx_timestamp(skb, sockc); skb_zcopy_set_nouarg(skb, ph.raw); skb_reserve(skb, hlen); skb_reset_network_header(skb); to_write = tp_len; if (sock->type == SOCK_DGRAM) { err = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, tp_len); if (unlikely(err < 0)) return -EINVAL; } else if (copylen) { int hdrlen = min_t(int, copylen, tp_len); skb_push(skb, dev->hard_header_len); skb_put(skb, copylen - dev->hard_header_len); err = skb_store_bits(skb, 0, data, hdrlen); if (unlikely(err)) return err; if (!dev_validate_header(dev, skb->data, hdrlen)) return -EINVAL; data += hdrlen; to_write -= hdrlen; } offset = offset_in_page(data); len_max = PAGE_SIZE - offset; len = ((to_write > len_max) ? len_max : to_write); skb->data_len = to_write; skb->len += to_write; skb->truesize += to_write; refcount_add(to_write, &po->sk.sk_wmem_alloc); while (likely(to_write)) { nr_frags = skb_shinfo(skb)->nr_frags; if (unlikely(nr_frags >= MAX_SKB_FRAGS)) { pr_err("Packet exceed the number of skb frags(%u)\n", (unsigned int)MAX_SKB_FRAGS); return -EFAULT; } page = pgv_to_page(data); data += len; flush_dcache_page(page); get_page(page); skb_fill_page_desc(skb, nr_frags, page, offset, len); to_write -= len; offset = 0; len_max = PAGE_SIZE; len = ((to_write > len_max) ? len_max : to_write); } packet_parse_headers(skb, sock); return tp_len; } static int tpacket_parse_header(struct packet_sock *po, void *frame, int size_max, void **data) { union tpacket_uhdr ph; int tp_len, off; ph.raw = frame; switch (po->tp_version) { case TPACKET_V3: if (ph.h3->tp_next_offset != 0) { pr_warn_once("variable sized slot not supported"); return -EINVAL; } tp_len = ph.h3->tp_len; break; case TPACKET_V2: tp_len = ph.h2->tp_len; break; default: tp_len = ph.h1->tp_len; break; } if (unlikely(tp_len > size_max)) { pr_err("packet size is too long (%d > %d)\n", tp_len, size_max); return -EMSGSIZE; } if (unlikely(packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF))) { int off_min, off_max; off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll); off_max = po->tx_ring.frame_size - tp_len; if (po->sk.sk_type == SOCK_DGRAM) { switch (po->tp_version) { case TPACKET_V3: off = ph.h3->tp_net; break; case TPACKET_V2: off = ph.h2->tp_net; break; default: off = ph.h1->tp_net; break; } } else { switch (po->tp_version) { case TPACKET_V3: off = ph.h3->tp_mac; break; case TPACKET_V2: off = ph.h2->tp_mac; break; default: off = ph.h1->tp_mac; break; } } if (unlikely((off < off_min) || (off_max < off))) return -EINVAL; } else { off = po->tp_hdrlen - sizeof(struct sockaddr_ll); } *data = frame + off; return tp_len; } static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) { struct sk_buff *skb = NULL; struct net_device *dev; struct virtio_net_hdr *vnet_hdr = NULL; struct sockcm_cookie sockc; __be16 proto; int err, reserve = 0; void *ph; DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); bool need_wait = !(msg->msg_flags & MSG_DONTWAIT); int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); unsigned char *addr = NULL; int tp_len, size_max; void *data; int len_sum = 0; int status = TP_STATUS_AVAILABLE; int hlen, tlen, copylen = 0; long timeo; mutex_lock(&po->pg_vec_lock); /* packet_sendmsg() check on tx_ring.pg_vec was lockless, * we need to confirm it under protection of pg_vec_lock. */ if (unlikely(!po->tx_ring.pg_vec)) { err = -EBUSY; goto out; } if (likely(saddr == NULL)) { dev = packet_cached_dev_get(po); proto = READ_ONCE(po->num); } else { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) goto out; if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr))) goto out; proto = saddr->sll_protocol; dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex); if (po->sk.sk_socket->type == SOCK_DGRAM) { if (dev && msg->msg_namelen < dev->addr_len + offsetof(struct sockaddr_ll, sll_addr)) goto out_put; addr = saddr->sll_addr; } } err = -ENXIO; if (unlikely(dev == NULL)) goto out; err = -ENETDOWN; if (unlikely(!(dev->flags & IFF_UP))) goto out_put; sockcm_init(&sockc, &po->sk); if (msg->msg_controllen) { err = sock_cmsg_send(&po->sk, msg, &sockc); if (unlikely(err)) goto out_put; } if (po->sk.sk_socket->type == SOCK_RAW) reserve = dev->hard_header_len; size_max = po->tx_ring.frame_size - (po->tp_hdrlen - sizeof(struct sockaddr_ll)); if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz) size_max = dev->mtu + reserve + VLAN_HLEN; timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT); reinit_completion(&po->skb_completion); do { ph = packet_current_frame(po, &po->tx_ring, TP_STATUS_SEND_REQUEST); if (unlikely(ph == NULL)) { /* Note: packet_read_pending() might be slow if we * have to call it as it's per_cpu variable, but in * fast-path we don't have to call it, only when ph * is NULL, we need to check the pending_refcnt. */ if (need_wait && packet_read_pending(&po->tx_ring)) { timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo); if (timeo <= 0) { err = !timeo ? -ETIMEDOUT : -ERESTARTSYS; goto out_put; } /* check for additional frames */ continue; } else break; } skb = NULL; tp_len = tpacket_parse_header(po, ph, size_max, &data); if (tp_len < 0) goto tpacket_error; status = TP_STATUS_SEND_REQUEST; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; if (vnet_hdr_sz) { vnet_hdr = data; data += vnet_hdr_sz; tp_len -= vnet_hdr_sz; if (tp_len < 0 || __packet_snd_vnet_parse(vnet_hdr, tp_len)) { tp_len = -EINVAL; goto tpacket_error; } copylen = __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len); } copylen = max_t(int, copylen, dev->hard_header_len); skb = sock_alloc_send_skb(&po->sk, hlen + tlen + sizeof(struct sockaddr_ll) + (copylen - dev->hard_header_len), !need_wait, &err); if (unlikely(skb == NULL)) { /* we assume the socket was initially writeable ... */ if (likely(len_sum > 0)) err = len_sum; goto out_status; } tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto, addr, hlen, copylen, &sockc); if (likely(tp_len >= 0) && tp_len > dev->mtu + reserve && !vnet_hdr_sz && !packet_extra_vlan_len_allowed(dev, skb)) tp_len = -EMSGSIZE; if (unlikely(tp_len < 0)) { tpacket_error: if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS)) { __packet_set_status(po, ph, TP_STATUS_AVAILABLE); packet_increment_head(&po->tx_ring); kfree_skb(skb); continue; } else { status = TP_STATUS_WRONG_FORMAT; err = tp_len; goto out_status; } } if (vnet_hdr_sz) { if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) { tp_len = -EINVAL; goto tpacket_error; } virtio_net_hdr_set_proto(skb, vnet_hdr); } skb->destructor = tpacket_destruct_skb; __packet_set_status(po, ph, TP_STATUS_SENDING); packet_inc_pending(&po->tx_ring); status = TP_STATUS_SEND_REQUEST; err = packet_xmit(po, skb); if (unlikely(err != 0)) { if (err > 0) err = net_xmit_errno(err); if (err && __packet_get_status(po, ph) == TP_STATUS_AVAILABLE) { /* skb was destructed already */ skb = NULL; goto out_status; } /* * skb was dropped but not destructed yet; * let's treat it like congestion or err < 0 */ err = 0; } packet_increment_head(&po->tx_ring); len_sum += tp_len; } while (1); err = len_sum; goto out_put; out_status: __packet_set_status(po, ph, status); kfree_skb(skb); out_put: dev_put(dev); out: mutex_unlock(&po->pg_vec_lock); return err; } static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, size_t reserve, size_t len, size_t linear, int noblock, int *err) { struct sk_buff *skb; /* Under a page? Don't bother with paged skb. */ if (prepad + len < PAGE_SIZE || !linear) linear = len; if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER); skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, err, PAGE_ALLOC_COSTLY_ORDER); if (!skb) return NULL; skb_reserve(skb, reserve); skb_put(skb, linear); skb->data_len = len - linear; skb->len += len - linear; return skb; } static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); struct sk_buff *skb; struct net_device *dev; __be16 proto; unsigned char *addr = NULL; int err, reserve = 0; struct sockcm_cookie sockc; struct virtio_net_hdr vnet_hdr = { 0 }; int offset = 0; struct packet_sock *po = pkt_sk(sk); int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); int hlen, tlen, linear; int extra_len = 0; /* * Get and verify the address. */ if (likely(saddr == NULL)) { dev = packet_cached_dev_get(po); proto = READ_ONCE(po->num); } else { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) goto out; if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr))) goto out; proto = saddr->sll_protocol; dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex); if (sock->type == SOCK_DGRAM) { if (dev && msg->msg_namelen < dev->addr_len + offsetof(struct sockaddr_ll, sll_addr)) goto out_unlock; addr = saddr->sll_addr; } } err = -ENXIO; if (unlikely(dev == NULL)) goto out_unlock; err = -ENETDOWN; if (unlikely(!(dev->flags & IFF_UP))) goto out_unlock; sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) goto out_unlock; } if (sock->type == SOCK_RAW) reserve = dev->hard_header_len; if (vnet_hdr_sz) { err = packet_snd_vnet_parse(msg, &len, &vnet_hdr, vnet_hdr_sz); if (err) goto out_unlock; } if (unlikely(sock_flag(sk, SOCK_NOFCS))) { if (!netif_supports_nofcs(dev)) { err = -EPROTONOSUPPORT; goto out_unlock; } extra_len = 4; /* We're doing our own CRC */ } err = -EMSGSIZE; if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len)) goto out_unlock; err = -ENOBUFS; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len); linear = max(linear, min_t(int, len, dev->hard_header_len)); skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear, msg->msg_flags & MSG_DONTWAIT, &err); if (skb == NULL) goto out_unlock; skb_reset_network_header(skb); err = -EINVAL; if (sock->type == SOCK_DGRAM) { offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len); if (unlikely(offset < 0)) goto out_free; } else if (reserve) { skb_reserve(skb, -reserve); if (len < reserve + sizeof(struct ipv6hdr) && dev->min_header_len != dev->hard_header_len) skb_reset_network_header(skb); } /* Returns -EFAULT on error */ err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len); if (err) goto out_free; if ((sock->type == SOCK_RAW && !dev_validate_header(dev, skb->data, len)) || !skb->len) { err = -EINVAL; goto out_free; } skb_setup_tx_timestamp(skb, &sockc); if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) && !packet_extra_vlan_len_allowed(dev, skb)) { err = -EMSGSIZE; goto out_free; } skb->protocol = proto; skb->dev = dev; skb->priority = sockc.priority; skb->mark = sockc.mark; skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); if (unlikely(extra_len == 4)) skb->no_fcs = 1; packet_parse_headers(skb, sock); if (vnet_hdr_sz) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); if (err) goto out_free; len += vnet_hdr_sz; virtio_net_hdr_set_proto(skb, &vnet_hdr); } err = packet_xmit(po, skb); if (unlikely(err != 0)) { if (err > 0) err = net_xmit_errno(err); if (err) goto out_unlock; } dev_put(dev); return len; out_free: kfree_skb(skb); out_unlock: dev_put(dev); out: return err; } static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); /* Reading tx_ring.pg_vec without holding pg_vec_lock is racy. * tpacket_snd() will redo the check safely. */ if (data_race(po->tx_ring.pg_vec)) return tpacket_snd(po, msg); return packet_snd(sock, msg, len); } /* * Close a PACKET socket. This is fairly simple. We immediately go * to 'closed' state and remove our protocol entry in the device list. */ static int packet_release(struct socket *sock) { struct sock *sk = sock->sk; struct packet_sock *po; struct packet_fanout *f; struct net *net; union tpacket_req_u req_u; if (!sk) return 0; net = sock_net(sk); po = pkt_sk(sk); mutex_lock(&net->packet.sklist_lock); sk_del_node_init_rcu(sk); mutex_unlock(&net->packet.sklist_lock); sock_prot_inuse_add(net, sk->sk_prot, -1); spin_lock(&po->bind_lock); unregister_prot_hook(sk, false); packet_cached_dev_reset(po); if (po->prot_hook.dev) { netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker); po->prot_hook.dev = NULL; } spin_unlock(&po->bind_lock); packet_flush_mclist(sk); lock_sock(sk); if (po->rx_ring.pg_vec) { memset(&req_u, 0, sizeof(req_u)); packet_set_ring(sk, &req_u, 1, 0); } if (po->tx_ring.pg_vec) { memset(&req_u, 0, sizeof(req_u)); packet_set_ring(sk, &req_u, 1, 1); } release_sock(sk); f = fanout_release(sk); synchronize_net(); kfree(po->rollover); if (f) { fanout_release_data(f); kvfree(f); } /* * Now the socket is dead. No more input will appear. */ sock_orphan(sk); sock->sk = NULL; /* Purge queues */ skb_queue_purge(&sk->sk_receive_queue); packet_free_pending(po); sock_put(sk); return 0; } /* * Attach a packet hook. */ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, __be16 proto) { struct packet_sock *po = pkt_sk(sk); struct net_device *dev = NULL; bool unlisted = false; bool need_rehook; int ret = 0; lock_sock(sk); spin_lock(&po->bind_lock); if (!proto) proto = po->num; rcu_read_lock(); if (po->fanout) { ret = -EINVAL; goto out_unlock; } if (name) { dev = dev_get_by_name_rcu(sock_net(sk), name); if (!dev) { ret = -ENODEV; goto out_unlock; } } else if (ifindex) { dev = dev_get_by_index_rcu(sock_net(sk), ifindex); if (!dev) { ret = -ENODEV; goto out_unlock; } } need_rehook = po->prot_hook.type != proto || po->prot_hook.dev != dev; if (need_rehook) { dev_hold(dev); if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) { rcu_read_unlock(); /* prevents packet_notifier() from calling * register_prot_hook() */ WRITE_ONCE(po->num, 0); __unregister_prot_hook(sk, true); rcu_read_lock(); if (dev) unlisted = !dev_get_by_index_rcu(sock_net(sk), dev->ifindex); } BUG_ON(packet_sock_flag(po, PACKET_SOCK_RUNNING)); WRITE_ONCE(po->num, proto); po->prot_hook.type = proto; netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker); if (unlikely(unlisted)) { po->prot_hook.dev = NULL; WRITE_ONCE(po->ifindex, -1); packet_cached_dev_reset(po); } else { netdev_hold(dev, &po->prot_hook.dev_tracker, GFP_ATOMIC); po->prot_hook.dev = dev; WRITE_ONCE(po->ifindex, dev ? dev->ifindex : 0); packet_cached_dev_assign(po, dev); } dev_put(dev); } if (proto == 0 || !need_rehook) goto out_unlock; if (!unlisted && (!dev || (dev->flags & IFF_UP))) { register_prot_hook(sk); } else { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } out_unlock: rcu_read_unlock(); spin_unlock(&po->bind_lock); release_sock(sk); return ret; } /* * Bind a packet socket to a device */ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; char name[sizeof(uaddr->sa_data_min) + 1]; /* * Check legality */ if (addr_len != sizeof(struct sockaddr)) return -EINVAL; /* uaddr->sa_data comes from the userspace, it's not guaranteed to be * zero-terminated. */ memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data_min)); name[sizeof(uaddr->sa_data_min)] = 0; return packet_do_bind(sk, name, 0, 0); } static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr; struct sock *sk = sock->sk; /* * Check legality */ if (addr_len < sizeof(struct sockaddr_ll)) return -EINVAL; if (sll->sll_family != AF_PACKET) return -EINVAL; return packet_do_bind(sk, NULL, sll->sll_ifindex, sll->sll_protocol); } static struct proto packet_proto = { .name = "PACKET", .owner = THIS_MODULE, .obj_size = sizeof(struct packet_sock), }; /* * Create a packet of type SOCK_PACKET. */ static int packet_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct packet_sock *po; __be16 proto = (__force __be16)protocol; /* weird, but documented */ int err; if (!ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW && sock->type != SOCK_PACKET) return -ESOCKTNOSUPPORT; sock->state = SS_UNCONNECTED; err = -ENOBUFS; sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern); if (sk == NULL) goto out; sock->ops = &packet_ops; if (sock->type == SOCK_PACKET) sock->ops = &packet_ops_spkt; po = pkt_sk(sk); err = packet_alloc_pending(po); if (err) goto out_sk_free; sock_init_data(sock, sk); init_completion(&po->skb_completion); sk->sk_family = PF_PACKET; po->num = proto; packet_cached_dev_reset(po); sk->sk_destruct = packet_sock_destruct; /* * Attach a protocol block */ spin_lock_init(&po->bind_lock); mutex_init(&po->pg_vec_lock); po->rollover = NULL; po->prot_hook.func = packet_rcv; if (sock->type == SOCK_PACKET) po->prot_hook.func = packet_rcv_spkt; po->prot_hook.af_packet_priv = sk; po->prot_hook.af_packet_net = sock_net(sk); if (proto) { po->prot_hook.type = proto; __register_prot_hook(sk); } mutex_lock(&net->packet.sklist_lock); sk_add_node_tail_rcu(sk, &net->packet.sklist); mutex_unlock(&net->packet.sklist_lock); sock_prot_inuse_add(net, &packet_proto, 1); return 0; out_sk_free: sk_free(sk); out: return err; } /* * Pull a packet from our receive queue and hand it to the user. * If necessary we block. */ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int copied, err; int vnet_hdr_len = READ_ONCE(pkt_sk(sk)->vnet_hdr_sz); unsigned int origlen = 0; err = -EINVAL; if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE)) goto out; #if 0 /* What error should we return now? EUNATTACH? */ if (pkt_sk(sk)->ifindex < 0) return -ENODEV; #endif if (flags & MSG_ERRQUEUE) { err = sock_recv_errqueue(sk, msg, len, SOL_PACKET, PACKET_TX_TIMESTAMP); goto out; } /* * Call the generic datagram receiver. This handles all sorts * of horrible races and re-entrancy so we can forget about it * in the protocol layers. * * Now it will return ENETDOWN, if device have just gone down, * but then it will block. */ skb = skb_recv_datagram(sk, flags, &err); /* * An error occurred so return it. Because skb_recv_datagram() * handles the blocking we don't see and worry about blocking * retries. */ if (skb == NULL) goto out; packet_rcv_try_clear_pressure(pkt_sk(sk)); if (vnet_hdr_len) { err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len); if (err) goto out_free; } /* You lose any data beyond the buffer you gave. If it worries * a user program they can ask the device for its MTU * anyway. */ copied = skb->len; if (copied > len) { copied = len; msg->msg_flags |= MSG_TRUNC; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free; if (sock->type != SOCK_PACKET) { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; /* Original length was stored in sockaddr_ll fields */ origlen = PACKET_SKB_CB(skb)->sa.origlen; sll->sll_family = AF_PACKET; sll->sll_protocol = (sock->type == SOCK_DGRAM) ? vlan_get_protocol_dgram(skb) : skb->protocol; } sock_recv_cmsgs(msg, sk, skb); if (msg->msg_name) { const size_t max_len = min(sizeof(skb->cb), sizeof(struct sockaddr_storage)); int copy_len; /* If the address length field is there to be filled * in, we fill it in now. */ if (sock->type == SOCK_PACKET) { __sockaddr_check_size(sizeof(struct sockaddr_pkt)); msg->msg_namelen = sizeof(struct sockaddr_pkt); copy_len = msg->msg_namelen; } else { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr); copy_len = msg->msg_namelen; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) { memset(msg->msg_name + offsetof(struct sockaddr_ll, sll_addr), 0, sizeof(sll->sll_addr)); msg->msg_namelen = sizeof(struct sockaddr_ll); } } if (WARN_ON_ONCE(copy_len > max_len)) { copy_len = max_len; msg->msg_namelen = copy_len; } memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len); } if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_AUXDATA)) { struct tpacket_auxdata aux; aux.tp_status = TP_STATUS_USER; if (skb->ip_summed == CHECKSUM_PARTIAL) aux.tp_status |= TP_STATUS_CSUMNOTREADY; else if (skb->pkt_type != PACKET_OUTGOING && skb_csum_unnecessary(skb)) aux.tp_status |= TP_STATUS_CSUM_VALID; if (skb_is_gso(skb) && skb_is_gso_tcp(skb)) aux.tp_status |= TP_STATUS_GSO_TCP; aux.tp_len = origlen; aux.tp_snaplen = skb->len; aux.tp_mac = 0; aux.tp_net = skb_network_offset(skb); if (skb_vlan_tag_present(skb)) { aux.tp_vlan_tci = skb_vlan_tag_get(skb); aux.tp_vlan_tpid = ntohs(skb->vlan_proto); aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else if (unlikely(sock->type == SOCK_DGRAM && eth_type_vlan(skb->protocol))) { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; struct net_device *dev; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), sll->sll_ifindex); if (dev) { aux.tp_vlan_tci = vlan_get_tci(skb, dev); aux.tp_vlan_tpid = ntohs(skb->protocol); aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { aux.tp_vlan_tci = 0; aux.tp_vlan_tpid = 0; } rcu_read_unlock(); } else { aux.tp_vlan_tci = 0; aux.tp_vlan_tpid = 0; } put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux); } /* * Free or return the buffer as appropriate. Again this * hides all the races and re-entrancy issues from us. */ err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied); out_free: skb_free_datagram(sk, skb); out: return err; } static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, int peer) { struct net_device *dev; struct sock *sk = sock->sk; if (peer) return -EOPNOTSUPP; uaddr->sa_family = AF_PACKET; memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data_min)); rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex)); if (dev) strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data_min)); rcu_read_unlock(); return sizeof(*uaddr); } static int packet_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct net_device *dev; struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr); int ifindex; if (peer) return -EOPNOTSUPP; ifindex = READ_ONCE(po->ifindex); sll->sll_family = AF_PACKET; sll->sll_ifindex = ifindex; sll->sll_protocol = READ_ONCE(po->num); sll->sll_pkttype = 0; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), ifindex); if (dev) { sll->sll_hatype = dev->type; sll->sll_halen = dev->addr_len; /* Let __fortify_memcpy_chk() know the actual buffer size. */ memcpy(((struct sockaddr_storage *)sll)->__data + offsetof(struct sockaddr_ll, sll_addr) - offsetofend(struct sockaddr_ll, sll_family), dev->dev_addr, dev->addr_len); } else { sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */ sll->sll_halen = 0; } rcu_read_unlock(); return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen; } static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what) { switch (i->type) { case PACKET_MR_MULTICAST: if (i->alen != dev->addr_len) return -EINVAL; if (what > 0) return dev_mc_add(dev, i->addr); else return dev_mc_del(dev, i->addr); break; case PACKET_MR_PROMISC: return dev_set_promiscuity(dev, what); case PACKET_MR_ALLMULTI: return dev_set_allmulti(dev, what); case PACKET_MR_UNICAST: if (i->alen != dev->addr_len) return -EINVAL; if (what > 0) return dev_uc_add(dev, i->addr); else return dev_uc_del(dev, i->addr); break; default: break; } return 0; } static void packet_dev_mclist_delete(struct net_device *dev, struct packet_mclist **mlp, struct list_head *list) { struct packet_mclist *ml; while ((ml = *mlp) != NULL) { if (ml->ifindex == dev->ifindex) { list_add(&ml->remove_list, list); *mlp = ml->next; } else mlp = &ml->next; } } static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq) { struct packet_sock *po = pkt_sk(sk); struct packet_mclist *ml, *i; struct net_device *dev; int err; rtnl_lock(); err = -ENODEV; dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex); if (!dev) goto done; err = -EINVAL; if (mreq->mr_alen > dev->addr_len) goto done; err = -ENOBUFS; i = kmalloc(sizeof(*i), GFP_KERNEL); if (i == NULL) goto done; err = 0; for (ml = po->mclist; ml; ml = ml->next) { if (ml->ifindex == mreq->mr_ifindex && ml->type == mreq->mr_type && ml->alen == mreq->mr_alen && memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { ml->count++; /* Free the new element ... */ kfree(i); goto done; } } i->type = mreq->mr_type; i->ifindex = mreq->mr_ifindex; i->alen = mreq->mr_alen; memcpy(i->addr, mreq->mr_address, i->alen); memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen); i->count = 1; INIT_LIST_HEAD(&i->remove_list); i->next = po->mclist; po->mclist = i; err = packet_dev_mc(dev, i, 1); if (err) { po->mclist = i->next; kfree(i); } done: rtnl_unlock(); return err; } static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq) { struct packet_mclist *ml, **mlp; rtnl_lock(); for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) { if (ml->ifindex == mreq->mr_ifindex && ml->type == mreq->mr_type && ml->alen == mreq->mr_alen && memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { if (--ml->count == 0) { struct net_device *dev; *mlp = ml->next; dev = __dev_get_by_index(sock_net(sk), ml->ifindex); if (dev) packet_dev_mc(dev, ml, -1); kfree(ml); } break; } } rtnl_unlock(); return 0; } static void packet_flush_mclist(struct sock *sk) { struct packet_sock *po = pkt_sk(sk); struct packet_mclist *ml; if (!po->mclist) return; rtnl_lock(); while ((ml = po->mclist) != NULL) { struct net_device *dev; po->mclist = ml->next; dev = __dev_get_by_index(sock_net(sk), ml->ifindex); if (dev != NULL) packet_dev_mc(dev, ml, -1); kfree(ml); } rtnl_unlock(); } static int packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); int ret; if (level != SOL_PACKET) return -ENOPROTOOPT; switch (optname) { case PACKET_ADD_MEMBERSHIP: case PACKET_DROP_MEMBERSHIP: { struct packet_mreq_max mreq; int len = optlen; memset(&mreq, 0, sizeof(mreq)); if (len < sizeof(struct packet_mreq)) return -EINVAL; if (len > sizeof(mreq)) len = sizeof(mreq); if (copy_from_sockptr(&mreq, optval, len)) return -EFAULT; if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address))) return -EINVAL; if (optname == PACKET_ADD_MEMBERSHIP) ret = packet_mc_add(sk, &mreq); else ret = packet_mc_drop(sk, &mreq); return ret; } case PACKET_RX_RING: case PACKET_TX_RING: { union tpacket_req_u req_u; ret = -EINVAL; lock_sock(sk); switch (po->tp_version) { case TPACKET_V1: case TPACKET_V2: if (optlen < sizeof(req_u.req)) break; ret = copy_from_sockptr(&req_u.req, optval, sizeof(req_u.req)) ? -EINVAL : 0; break; case TPACKET_V3: default: if (optlen < sizeof(req_u.req3)) break; ret = copy_from_sockptr(&req_u.req3, optval, sizeof(req_u.req3)) ? -EINVAL : 0; break; } if (!ret) ret = packet_set_ring(sk, &req_u, 0, optname == PACKET_TX_RING); release_sock(sk); return ret; } case PACKET_COPY_THRESH: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; WRITE_ONCE(pkt_sk(sk)->copy_thresh, val); return 0; } case PACKET_VERSION: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; switch (val) { case TPACKET_V1: case TPACKET_V2: case TPACKET_V3: break; default: return -EINVAL; } lock_sock(sk); if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { po->tp_version = val; ret = 0; } release_sock(sk); return ret; } case PACKET_RESERVE: { unsigned int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; if (val > INT_MAX) return -EINVAL; lock_sock(sk); if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { po->tp_reserve = val; ret = 0; } release_sock(sk); return ret; } case PACKET_LOSS: { unsigned int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { packet_sock_flag_set(po, PACKET_SOCK_TP_LOSS, val); ret = 0; } release_sock(sk); return ret; } case PACKET_AUXDATA: { int val; if (optlen < sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; packet_sock_flag_set(po, PACKET_SOCK_AUXDATA, val); return 0; } case PACKET_ORIGDEV: { int val; if (optlen < sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; packet_sock_flag_set(po, PACKET_SOCK_ORIGDEV, val); return 0; } case PACKET_VNET_HDR: case PACKET_VNET_HDR_SZ: { int val, hdr_len; if (sock->type != SOCK_RAW) return -EINVAL; if (optlen < sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; if (optname == PACKET_VNET_HDR_SZ) { if (val && val != sizeof(struct virtio_net_hdr) && val != sizeof(struct virtio_net_hdr_mrg_rxbuf)) return -EINVAL; hdr_len = val; } else { hdr_len = val ? sizeof(struct virtio_net_hdr) : 0; } lock_sock(sk); if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { WRITE_ONCE(po->vnet_hdr_sz, hdr_len); ret = 0; } release_sock(sk); return ret; } case PACKET_TIMESTAMP: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; WRITE_ONCE(po->tp_tstamp, val); return 0; } case PACKET_FANOUT: { struct fanout_args args = { 0 }; if (optlen != sizeof(int) && optlen != sizeof(args)) return -EINVAL; if (copy_from_sockptr(&args, optval, optlen)) return -EFAULT; return fanout_add(sk, &args); } case PACKET_FANOUT_DATA: { /* Paired with the WRITE_ONCE() in fanout_add() */ if (!READ_ONCE(po->fanout)) return -EINVAL; return fanout_set_data(po, optval, optlen); } case PACKET_IGNORE_OUTGOING: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; if (val < 0 || val > 1) return -EINVAL; WRITE_ONCE(po->prot_hook.ignore_outgoing, !!val); return 0; } case PACKET_TX_HAS_OFF: { unsigned int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); if (!po->rx_ring.pg_vec && !po->tx_ring.pg_vec) packet_sock_flag_set(po, PACKET_SOCK_TX_HAS_OFF, val); release_sock(sk); return 0; } case PACKET_QDISC_BYPASS: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; packet_sock_flag_set(po, PACKET_SOCK_QDISC_BYPASS, val); return 0; } default: return -ENOPROTOOPT; } } static int packet_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { int len; int val, lv = sizeof(val); struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); void *data = &val; union tpacket_stats_u st; struct tpacket_rollover_stats rstats; int drops; if (level != SOL_PACKET) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case PACKET_STATISTICS: spin_lock_bh(&sk->sk_receive_queue.lock); memcpy(&st, &po->stats, sizeof(st)); memset(&po->stats, 0, sizeof(po->stats)); spin_unlock_bh(&sk->sk_receive_queue.lock); drops = atomic_xchg(&po->tp_drops, 0); if (po->tp_version == TPACKET_V3) { lv = sizeof(struct tpacket_stats_v3); st.stats3.tp_drops = drops; st.stats3.tp_packets += drops; data = &st.stats3; } else { lv = sizeof(struct tpacket_stats); st.stats1.tp_drops = drops; st.stats1.tp_packets += drops; data = &st.stats1; } break; case PACKET_AUXDATA: val = packet_sock_flag(po, PACKET_SOCK_AUXDATA); break; case PACKET_ORIGDEV: val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV); break; case PACKET_VNET_HDR: val = !!READ_ONCE(po->vnet_hdr_sz); break; case PACKET_VNET_HDR_SZ: val = READ_ONCE(po->vnet_hdr_sz); break; case PACKET_COPY_THRESH: val = READ_ONCE(pkt_sk(sk)->copy_thresh); break; case PACKET_VERSION: val = po->tp_version; break; case PACKET_HDRLEN: if (len > sizeof(int)) len = sizeof(int); if (len < sizeof(int)) return -EINVAL; if (copy_from_user(&val, optval, len)) return -EFAULT; switch (val) { case TPACKET_V1: val = sizeof(struct tpacket_hdr); break; case TPACKET_V2: val = sizeof(struct tpacket2_hdr); break; case TPACKET_V3: val = sizeof(struct tpacket3_hdr); break; default: return -EINVAL; } break; case PACKET_RESERVE: val = po->tp_reserve; break; case PACKET_LOSS: val = packet_sock_flag(po, PACKET_SOCK_TP_LOSS); break; case PACKET_TIMESTAMP: val = READ_ONCE(po->tp_tstamp); break; case PACKET_FANOUT: val = (po->fanout ? ((u32)po->fanout->id | ((u32)po->fanout->type << 16) | ((u32)po->fanout->flags << 24)) : 0); break; case PACKET_IGNORE_OUTGOING: val = READ_ONCE(po->prot_hook.ignore_outgoing); break; case PACKET_ROLLOVER_STATS: if (!po->rollover) return -EINVAL; rstats.tp_all = atomic_long_read(&po->rollover->num); rstats.tp_huge = atomic_long_read(&po->rollover->num_huge); rstats.tp_failed = atomic_long_read(&po->rollover->num_failed); data = &rstats; lv = sizeof(rstats); break; case PACKET_TX_HAS_OFF: val = packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF); break; case PACKET_QDISC_BYPASS: val = packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS); break; default: return -ENOPROTOOPT; } if (len > lv) len = lv; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, data, len)) return -EFAULT; return 0; } static int packet_notifier(struct notifier_block *this, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct packet_mclist *ml, *tmp; LIST_HEAD(mclist); struct sock *sk; rcu_read_lock(); sk_for_each_rcu(sk, &net->packet.sklist) { struct packet_sock *po = pkt_sk(sk); switch (msg) { case NETDEV_UNREGISTER: if (po->mclist) packet_dev_mclist_delete(dev, &po->mclist, &mclist); fallthrough; case NETDEV_DOWN: if (dev->ifindex == po->ifindex) { spin_lock(&po->bind_lock); if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) { __unregister_prot_hook(sk, false); sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } if (msg == NETDEV_UNREGISTER) { packet_cached_dev_reset(po); WRITE_ONCE(po->ifindex, -1); netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker); po->prot_hook.dev = NULL; } spin_unlock(&po->bind_lock); } break; case NETDEV_UP: if (dev->ifindex == po->ifindex) { spin_lock(&po->bind_lock); if (po->num) register_prot_hook(sk); spin_unlock(&po->bind_lock); } break; } } rcu_read_unlock(); /* packet_dev_mc might grab instance locks so can't run under rcu */ list_for_each_entry_safe(ml, tmp, &mclist, remove_list) { packet_dev_mc(dev, ml, -1); kfree(ml); } return NOTIFY_DONE; } static int packet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; switch (cmd) { case SIOCOUTQ: { int amount = sk_wmem_alloc_get(sk); return put_user(amount, (int __user *)arg); } case SIOCINQ: { struct sk_buff *skb; int amount = 0; spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) amount = skb->len; spin_unlock_bh(&sk->sk_receive_queue.lock); return put_user(amount, (int __user *)arg); } #ifdef CONFIG_INET case SIOCADDRT: case SIOCDELRT: case SIOCDARP: case SIOCGARP: case SIOCSARP: case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCSIFFLAGS: return inet_dgram_ops.ioctl(sock, cmd, arg); #endif default: return -ENOIOCTLCMD; } return 0; } static __poll_t packet_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); __poll_t mask = datagram_poll(file, sock, wait); spin_lock_bh(&sk->sk_receive_queue.lock); if (po->rx_ring.pg_vec) { if (!packet_previous_rx_frame(po, &po->rx_ring, TP_STATUS_KERNEL)) mask |= EPOLLIN | EPOLLRDNORM; } packet_rcv_try_clear_pressure(po); spin_unlock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_write_queue.lock); if (po->tx_ring.pg_vec) { if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE)) mask |= EPOLLOUT | EPOLLWRNORM; } spin_unlock_bh(&sk->sk_write_queue.lock); return mask; } /* Dirty? Well, I still did not learn better way to account * for user mmaps. */ static void packet_mm_open(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct socket *sock = file->private_data; struct sock *sk = sock->sk; if (sk) atomic_long_inc(&pkt_sk(sk)->mapped); } static void packet_mm_close(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct socket *sock = file->private_data; struct sock *sk = sock->sk; if (sk) atomic_long_dec(&pkt_sk(sk)->mapped); } static const struct vm_operations_struct packet_mmap_ops = { .open = packet_mm_open, .close = packet_mm_close, }; static void free_pg_vec(struct pgv *pg_vec, unsigned int order, unsigned int len) { int i; for (i = 0; i < len; i++) { if (likely(pg_vec[i].buffer)) { if (is_vmalloc_addr(pg_vec[i].buffer)) vfree(pg_vec[i].buffer); else free_pages((unsigned long)pg_vec[i].buffer, order); pg_vec[i].buffer = NULL; } } kfree(pg_vec); } static char *alloc_one_pg_vec_page(unsigned long order) { char *buffer; gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; buffer = (char *) __get_free_pages(gfp_flags, order); if (buffer) return buffer; /* __get_free_pages failed, fall back to vmalloc */ buffer = vzalloc(array_size((1 << order), PAGE_SIZE)); if (buffer) return buffer; /* vmalloc failed, lets dig into swap here */ gfp_flags &= ~__GFP_NORETRY; buffer = (char *) __get_free_pages(gfp_flags, order); if (buffer) return buffer; /* complete and utter failure */ return NULL; } static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order) { unsigned int block_nr = req->tp_block_nr; struct pgv *pg_vec; int i; pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN); if (unlikely(!pg_vec)) goto out; for (i = 0; i < block_nr; i++) { pg_vec[i].buffer = alloc_one_pg_vec_page(order); if (unlikely(!pg_vec[i].buffer)) goto out_free_pgvec; } out: return pg_vec; out_free_pgvec: free_pg_vec(pg_vec, order, block_nr); pg_vec = NULL; goto out; } static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, int closing, int tx_ring) { struct pgv *pg_vec = NULL; struct packet_sock *po = pkt_sk(sk); unsigned long *rx_owner_map = NULL; int was_running, order = 0; struct packet_ring_buffer *rb; struct sk_buff_head *rb_queue; __be16 num; int err; /* Added to avoid minimal code churn */ struct tpacket_req *req = &req_u->req; rb = tx_ring ? &po->tx_ring : &po->rx_ring; rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; err = -EBUSY; if (!closing) { if (atomic_long_read(&po->mapped)) goto out; if (packet_read_pending(rb)) goto out; } if (req->tp_block_nr) { unsigned int min_frame_size; /* Sanity tests and some calculations */ err = -EBUSY; if (unlikely(rb->pg_vec)) goto out; switch (po->tp_version) { case TPACKET_V1: po->tp_hdrlen = TPACKET_HDRLEN; break; case TPACKET_V2: po->tp_hdrlen = TPACKET2_HDRLEN; break; case TPACKET_V3: po->tp_hdrlen = TPACKET3_HDRLEN; break; } err = -EINVAL; if (unlikely((int)req->tp_block_size <= 0)) goto out; if (unlikely(!PAGE_ALIGNED(req->tp_block_size))) goto out; min_frame_size = po->tp_hdrlen + po->tp_reserve; if (po->tp_version >= TPACKET_V3 && req->tp_block_size < BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size) goto out; if (unlikely(req->tp_frame_size < min_frame_size)) goto out; if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) goto out; rb->frames_per_block = req->tp_block_size / req->tp_frame_size; if (unlikely(rb->frames_per_block == 0)) goto out; if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr)) goto out; if (unlikely((rb->frames_per_block * req->tp_block_nr) != req->tp_frame_nr)) goto out; err = -ENOMEM; order = get_order(req->tp_block_size); pg_vec = alloc_pg_vec(req, order); if (unlikely(!pg_vec)) goto out; switch (po->tp_version) { case TPACKET_V3: /* Block transmit is not supported yet */ if (!tx_ring) { init_prb_bdqc(po, rb, pg_vec, req_u); } else { struct tpacket_req3 *req3 = &req_u->req3; if (req3->tp_retire_blk_tov || req3->tp_sizeof_priv || req3->tp_feature_req_word) { err = -EINVAL; goto out_free_pg_vec; } } break; default: if (!tx_ring) { rx_owner_map = bitmap_alloc(req->tp_frame_nr, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO); if (!rx_owner_map) goto out_free_pg_vec; } break; } } /* Done */ else { err = -EINVAL; if (unlikely(req->tp_frame_nr)) goto out; } /* Detach socket from network */ spin_lock(&po->bind_lock); was_running = packet_sock_flag(po, PACKET_SOCK_RUNNING); num = po->num; WRITE_ONCE(po->num, 0); if (was_running) __unregister_prot_hook(sk, false); spin_unlock(&po->bind_lock); synchronize_net(); err = -EBUSY; mutex_lock(&po->pg_vec_lock); if (closing || atomic_long_read(&po->mapped) == 0) { err = 0; spin_lock_bh(&rb_queue->lock); swap(rb->pg_vec, pg_vec); if (po->tp_version <= TPACKET_V2) swap(rb->rx_owner_map, rx_owner_map); rb->frame_max = (req->tp_frame_nr - 1); rb->head = 0; rb->frame_size = req->tp_frame_size; spin_unlock_bh(&rb_queue->lock); swap(rb->pg_vec_order, order); swap(rb->pg_vec_len, req->tp_block_nr); rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE; po->prot_hook.func = (po->rx_ring.pg_vec) ? tpacket_rcv : packet_rcv; skb_queue_purge(rb_queue); if (atomic_long_read(&po->mapped)) pr_err("packet_mmap: vma is busy: %ld\n", atomic_long_read(&po->mapped)); } mutex_unlock(&po->pg_vec_lock); spin_lock(&po->bind_lock); WRITE_ONCE(po->num, num); if (was_running) register_prot_hook(sk); spin_unlock(&po->bind_lock); if (pg_vec && (po->tp_version > TPACKET_V2)) { /* Because we don't support block-based V3 on tx-ring */ if (!tx_ring) prb_shutdown_retire_blk_timer(po, rb_queue); } out_free_pg_vec: if (pg_vec) { bitmap_free(rx_owner_map); free_pg_vec(pg_vec, order, req->tp_block_nr); } out: return err; } static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); unsigned long size, expected_size; struct packet_ring_buffer *rb; unsigned long start; int err = -EINVAL; int i; if (vma->vm_pgoff) return -EINVAL; mutex_lock(&po->pg_vec_lock); expected_size = 0; for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) { if (rb->pg_vec) { expected_size += rb->pg_vec_len * rb->pg_vec_pages * PAGE_SIZE; } } if (expected_size == 0) goto out; size = vma->vm_end - vma->vm_start; if (size != expected_size) goto out; start = vma->vm_start; for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) { if (rb->pg_vec == NULL) continue; for (i = 0; i < rb->pg_vec_len; i++) { struct page *page; void *kaddr = rb->pg_vec[i].buffer; int pg_num; for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) { page = pgv_to_page(kaddr); err = vm_insert_page(vma, start, page); if (unlikely(err)) goto out; start += PAGE_SIZE; kaddr += PAGE_SIZE; } } } atomic_long_inc(&po->mapped); vma->vm_ops = &packet_mmap_ops; err = 0; out: mutex_unlock(&po->pg_vec_lock); return err; } static const struct proto_ops packet_ops_spkt = { .family = PF_PACKET, .owner = THIS_MODULE, .release = packet_release, .bind = packet_bind_spkt, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = packet_getname_spkt, .poll = datagram_poll, .ioctl = packet_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .sendmsg = packet_sendmsg_spkt, .recvmsg = packet_recvmsg, .mmap = sock_no_mmap, }; static const struct proto_ops packet_ops = { .family = PF_PACKET, .owner = THIS_MODULE, .release = packet_release, .bind = packet_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = packet_getname, .poll = packet_poll, .ioctl = packet_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = packet_setsockopt, .getsockopt = packet_getsockopt, .sendmsg = packet_sendmsg, .recvmsg = packet_recvmsg, .mmap = packet_mmap, }; static const struct net_proto_family packet_family_ops = { .family = PF_PACKET, .create = packet_create, .owner = THIS_MODULE, }; static struct notifier_block packet_netdev_notifier = { .notifier_call = packet_notifier, }; #ifdef CONFIG_PROC_FS static void *packet_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct net *net = seq_file_net(seq); rcu_read_lock(); return seq_hlist_start_head_rcu(&net->packet.sklist, *pos); } static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct net *net = seq_file_net(seq); return seq_hlist_next_rcu(v, &net->packet.sklist, pos); } static void packet_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int packet_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_printf(seq, "%*sRefCnt Type Proto Iface R Rmem User Inode\n", IS_ENABLED(CONFIG_64BIT) ? -17 : -9, "sk"); else { struct sock *s = sk_entry(v); const struct packet_sock *po = pkt_sk(s); seq_printf(seq, "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n", s, refcount_read(&s->sk_refcnt), s->sk_type, ntohs(READ_ONCE(po->num)), READ_ONCE(po->ifindex), packet_sock_flag(po, PACKET_SOCK_RUNNING), atomic_read(&s->sk_rmem_alloc), from_kuid_munged(seq_user_ns(seq), sk_uid(s)), sock_i_ino(s)); } return 0; } static const struct seq_operations packet_seq_ops = { .start = packet_seq_start, .next = packet_seq_next, .stop = packet_seq_stop, .show = packet_seq_show, }; #endif static int __net_init packet_net_init(struct net *net) { mutex_init(&net->packet.sklist_lock); INIT_HLIST_HEAD(&net->packet.sklist); #ifdef CONFIG_PROC_FS if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops, sizeof(struct seq_net_private))) return -ENOMEM; #endif /* CONFIG_PROC_FS */ return 0; } static void __net_exit packet_net_exit(struct net *net) { remove_proc_entry("packet", net->proc_net); WARN_ON_ONCE(!hlist_empty(&net->packet.sklist)); } static struct pernet_operations packet_net_ops = { .init = packet_net_init, .exit = packet_net_exit, }; static void __exit packet_exit(void) { sock_unregister(PF_PACKET); proto_unregister(&packet_proto); unregister_netdevice_notifier(&packet_netdev_notifier); unregister_pernet_subsys(&packet_net_ops); } static int __init packet_init(void) { int rc; rc = register_pernet_subsys(&packet_net_ops); if (rc) goto out; rc = register_netdevice_notifier(&packet_netdev_notifier); if (rc) goto out_pernet; rc = proto_register(&packet_proto, 0); if (rc) goto out_notifier; rc = sock_register(&packet_family_ops); if (rc) goto out_proto; return 0; out_proto: proto_unregister(&packet_proto); out_notifier: unregister_netdevice_notifier(&packet_netdev_notifier); out_pernet: unregister_pernet_subsys(&packet_net_ops); out: return rc; } module_init(packet_init); module_exit(packet_exit); MODULE_DESCRIPTION("Packet socket support (AF_PACKET)"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_PACKET);
456 456 155 456 456 456 548 515 12 4 4 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_DST_METADATA_H #define __NET_DST_METADATA_H 1 #include <linux/skbuff.h> #include <net/ip.h> #include <net/ip_tunnels.h> #include <net/macsec.h> #include <net/dst.h> enum metadata_type { METADATA_IP_TUNNEL, METADATA_HW_PORT_MUX, METADATA_MACSEC, METADATA_XFRM, }; struct hw_port_info { struct net_device *lower_dev; u32 port_id; }; struct macsec_info { sci_t sci; }; struct xfrm_md_info { u32 if_id; int link; struct dst_entry *dst_orig; }; struct metadata_dst { struct dst_entry dst; enum metadata_type type; union { struct ip_tunnel_info tun_info; struct hw_port_info port_info; struct macsec_info macsec_info; struct xfrm_md_info xfrm_info; } u; }; static inline struct metadata_dst *skb_metadata_dst(const struct sk_buff *skb) { struct metadata_dst *md_dst = (struct metadata_dst *) skb_dst(skb); if (md_dst && md_dst->dst.flags & DST_METADATA) return md_dst; return NULL; } static inline struct ip_tunnel_info * skb_tunnel_info(const struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); struct dst_entry *dst; if (md_dst && md_dst->type == METADATA_IP_TUNNEL) return &md_dst->u.tun_info; dst = skb_dst(skb); if (dst && dst->lwtstate && (dst->lwtstate->type == LWTUNNEL_ENCAP_IP || dst->lwtstate->type == LWTUNNEL_ENCAP_IP6)) return lwt_tun_info(dst->lwtstate); return NULL; } static inline struct xfrm_md_info *lwt_xfrm_info(struct lwtunnel_state *lwt) { return (struct xfrm_md_info *)lwt->data; } static inline struct xfrm_md_info *skb_xfrm_md_info(const struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); struct dst_entry *dst; if (md_dst && md_dst->type == METADATA_XFRM) return &md_dst->u.xfrm_info; dst = skb_dst(skb); if (dst && dst->lwtstate && dst->lwtstate->type == LWTUNNEL_ENCAP_XFRM) return lwt_xfrm_info(dst->lwtstate); return NULL; } static inline bool skb_valid_dst(const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); return dst && !(dst->flags & DST_METADATA); } static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a, const struct sk_buff *skb_b) { const struct metadata_dst *a, *b; if (!(skb_a->_skb_refdst | skb_b->_skb_refdst)) return 0; a = (const struct metadata_dst *) skb_dst(skb_a); b = (const struct metadata_dst *) skb_dst(skb_b); if (!a != !b || a->type != b->type) return 1; switch (a->type) { case METADATA_HW_PORT_MUX: return memcmp(&a->u.port_info, &b->u.port_info, sizeof(a->u.port_info)); case METADATA_IP_TUNNEL: return memcmp(&a->u.tun_info, &b->u.tun_info, sizeof(a->u.tun_info) + a->u.tun_info.options_len); case METADATA_MACSEC: return memcmp(&a->u.macsec_info, &b->u.macsec_info, sizeof(a->u.macsec_info)); case METADATA_XFRM: return memcmp(&a->u.xfrm_info, &b->u.xfrm_info, sizeof(a->u.xfrm_info)); default: return 1; } } void metadata_dst_free(struct metadata_dst *); struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, gfp_t flags); void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst); struct metadata_dst __percpu * metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags); static inline struct metadata_dst *tun_rx_dst(int md_size) { struct metadata_dst *tun_dst; tun_dst = metadata_dst_alloc(md_size, METADATA_IP_TUNNEL, GFP_ATOMIC); if (!tun_dst) return NULL; tun_dst->u.tun_info.options_len = 0; tun_dst->u.tun_info.mode = 0; return tun_dst; } static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); int md_size; struct metadata_dst *new_md; if (!md_dst || md_dst->type != METADATA_IP_TUNNEL) return ERR_PTR(-EINVAL); md_size = md_dst->u.tun_info.options_len; new_md = metadata_dst_alloc(md_size, METADATA_IP_TUNNEL, GFP_ATOMIC); if (!new_md) return ERR_PTR(-ENOMEM); memcpy(&new_md->u.tun_info, &md_dst->u.tun_info, sizeof(struct ip_tunnel_info) + md_size); #ifdef CONFIG_DST_CACHE /* Unclone the dst cache if there is one */ if (new_md->u.tun_info.dst_cache.cache) { int ret; ret = dst_cache_init(&new_md->u.tun_info.dst_cache, GFP_ATOMIC); if (ret) { metadata_dst_free(new_md); return ERR_PTR(ret); } } #endif skb_dst_drop(skb); skb_dst_set(skb, &new_md->dst); return new_md; } static inline struct ip_tunnel_info *skb_tunnel_info_unclone(struct sk_buff *skb) { struct metadata_dst *dst; dst = tun_dst_unclone(skb); if (IS_ERR(dst)) return NULL; return &dst->u.tun_info; } static inline struct metadata_dst *__ip_tun_set_dst(__be32 saddr, __be32 daddr, __u8 tos, __u8 ttl, __be16 tp_dst, const unsigned long *flags, __be64 tunnel_id, int md_size) { struct metadata_dst *tun_dst; tun_dst = tun_rx_dst(md_size); if (!tun_dst) return NULL; ip_tunnel_key_init(&tun_dst->u.tun_info.key, saddr, daddr, tos, ttl, 0, 0, tp_dst, tunnel_id, flags); return tun_dst; } static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, const unsigned long *flags, __be64 tunnel_id, int md_size) { const struct iphdr *iph = ip_hdr(skb); struct metadata_dst *tun_dst; tun_dst = __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl, 0, flags, tunnel_id, md_size); if (tun_dst && (iph->frag_off & htons(IP_DF))) __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, tun_dst->u.tun_info.key.tun_flags); return tun_dst; } static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *saddr, const struct in6_addr *daddr, __u8 tos, __u8 ttl, __be16 tp_dst, __be32 label, const unsigned long *flags, __be64 tunnel_id, int md_size) { struct metadata_dst *tun_dst; struct ip_tunnel_info *info; tun_dst = tun_rx_dst(md_size); if (!tun_dst) return NULL; info = &tun_dst->u.tun_info; info->mode = IP_TUNNEL_INFO_IPV6; ip_tunnel_flags_copy(info->key.tun_flags, flags); info->key.tun_id = tunnel_id; info->key.tp_src = 0; info->key.tp_dst = tp_dst; info->key.u.ipv6.src = *saddr; info->key.u.ipv6.dst = *daddr; info->key.tos = tos; info->key.ttl = ttl; info->key.label = label; return tun_dst; } static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb, const unsigned long *flags, __be64 tunnel_id, int md_size) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); return __ipv6_tun_set_dst(&ip6h->saddr, &ip6h->daddr, ipv6_get_dsfield(ip6h), ip6h->hop_limit, 0, ip6_flowlabel(ip6h), flags, tunnel_id, md_size); } #endif /* __NET_DST_METADATA_H */
51 1 61 53 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 /* SPDX-License-Identifier: GPL-2.0 */ /* * Multipath TCP * * Copyright (c) 2017 - 2019, Intel Corporation. */ #ifndef __NET_MPTCP_H #define __NET_MPTCP_H #include <linux/skbuff.h> #include <linux/tcp.h> #include <linux/types.h> struct mptcp_info; struct mptcp_sock; struct mptcp_pm_addr_entry; struct seq_file; /* MPTCP sk_buff extension data */ struct mptcp_ext { union { u64 data_ack; u32 data_ack32; }; u64 data_seq; u32 subflow_seq; u16 data_len; __sum16 csum; u8 use_map:1, dsn64:1, data_fin:1, use_ack:1, ack64:1, mpc_map:1, frozen:1, reset_transient:1; u8 reset_reason:4, csum_reqd:1, infinite_map:1; }; #define MPTCPOPT_HMAC_LEN 20 #define MPTCP_RM_IDS_MAX 8 struct mptcp_rm_list { u8 ids[MPTCP_RM_IDS_MAX]; u8 nr; }; struct mptcp_addr_info { u8 id; sa_family_t family; __be16 port; union { struct in_addr addr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) struct in6_addr addr6; #endif }; }; struct mptcp_out_options { #if IS_ENABLED(CONFIG_MPTCP) u16 suboptions; struct mptcp_rm_list rm_list; u8 join_id; u8 backup; u8 reset_reason:4, reset_transient:1, csum_reqd:1, allow_join_id0:1; union { struct { u64 sndr_key; u64 rcvr_key; u64 data_seq; u32 subflow_seq; u16 data_len; __sum16 csum; }; struct { struct mptcp_addr_info addr; u64 ahmac; }; struct { struct mptcp_ext ext_copy; u64 fail_seq; }; struct { u32 nonce; u32 token; u64 thmac; u8 hmac[MPTCPOPT_HMAC_LEN]; }; }; #endif }; #define MPTCP_SCHED_NAME_MAX 16 #define MPTCP_SCHED_MAX 128 #define MPTCP_SCHED_BUF_MAX (MPTCP_SCHED_NAME_MAX * MPTCP_SCHED_MAX) struct mptcp_sched_ops { int (*get_send)(struct mptcp_sock *msk); int (*get_retrans)(struct mptcp_sock *msk); char name[MPTCP_SCHED_NAME_MAX]; struct module *owner; struct list_head list; void (*init)(struct mptcp_sock *msk); void (*release)(struct mptcp_sock *msk); } ____cacheline_aligned_in_smp; #define MPTCP_PM_NAME_MAX 16 #define MPTCP_PM_MAX 128 #define MPTCP_PM_BUF_MAX (MPTCP_PM_NAME_MAX * MPTCP_PM_MAX) struct mptcp_pm_ops { char name[MPTCP_PM_NAME_MAX]; struct module *owner; struct list_head list; void (*init)(struct mptcp_sock *msk); void (*release)(struct mptcp_sock *msk); } ____cacheline_aligned_in_smp; #ifdef CONFIG_MPTCP void mptcp_init(void); static inline bool sk_is_mptcp(const struct sock *sk) { return tcp_sk(sk)->is_mptcp; } static inline bool rsk_is_mptcp(const struct request_sock *req) { return tcp_rsk(req)->is_mptcp; } static inline bool rsk_drop_req(const struct request_sock *req) { return tcp_rsk(req)->is_mptcp && tcp_rsk(req)->drop_req; } void mptcp_space(const struct sock *ssk, int *space, int *full_space); bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, unsigned int *size, struct mptcp_out_options *opts); bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, struct mptcp_out_options *opts); bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts); bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb); void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp, struct mptcp_out_options *opts); void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info); /* move the skb extension owership, with the assumption that 'to' is * newly allocated */ static inline void mptcp_skb_ext_move(struct sk_buff *to, struct sk_buff *from) { if (!skb_ext_exist(from, SKB_EXT_MPTCP)) return; if (WARN_ON_ONCE(to->active_extensions)) skb_ext_put(to); to->active_extensions = from->active_extensions; to->extensions = from->extensions; from->active_extensions = 0; } static inline void mptcp_skb_ext_copy(struct sk_buff *to, struct sk_buff *from) { struct mptcp_ext *from_ext; from_ext = skb_ext_find(from, SKB_EXT_MPTCP); if (!from_ext) return; from_ext->frozen = 1; skb_ext_copy(to, from); } static inline bool mptcp_ext_matches(const struct mptcp_ext *to_ext, const struct mptcp_ext *from_ext) { /* MPTCP always clears the ext when adding it to the skb, so * holes do not bother us here */ return !from_ext || (to_ext && from_ext && !memcmp(from_ext, to_ext, sizeof(struct mptcp_ext))); } /* check if skbs can be collapsed. * MPTCP collapse is allowed if neither @to or @from carry an mptcp data * mapping, or if the extension of @to is the same as @from. * Collapsing is not possible if @to lacks an extension, but @from carries one. */ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { return mptcp_ext_matches(skb_ext_find(to, SKB_EXT_MPTCP), skb_ext_find(from, SKB_EXT_MPTCP)); } void mptcp_seq_show(struct seq_file *seq); int mptcp_subflow_init_cookie_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb); struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener); __be32 mptcp_get_reset_option(const struct sk_buff *skb); static inline __be32 mptcp_reset_option(const struct sk_buff *skb) { if (skb_ext_exist(skb, SKB_EXT_MPTCP)) return mptcp_get_reset_option(skb); return htonl(0u); } void mptcp_active_detect_blackhole(struct sock *sk, bool expired); #else static inline void mptcp_init(void) { } static inline bool sk_is_mptcp(const struct sock *sk) { return false; } static inline bool rsk_is_mptcp(const struct request_sock *req) { return false; } static inline bool rsk_drop_req(const struct request_sock *req) { return false; } static inline bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, unsigned int *size, struct mptcp_out_options *opts) { return false; } static inline bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, struct mptcp_out_options *opts) { return false; } static inline bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { return false; } static inline bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) { return true; } static inline void mptcp_skb_ext_move(struct sk_buff *to, const struct sk_buff *from) { } static inline void mptcp_skb_ext_copy(struct sk_buff *to, struct sk_buff *from) { } static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { return true; } static inline void mptcp_space(const struct sock *ssk, int *s, int *fs) { } static inline void mptcp_seq_show(struct seq_file *seq) { } static inline int mptcp_subflow_init_cookie_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) { return 0; /* TCP fallback */ } static inline struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener) { return NULL; } static inline __be32 mptcp_reset_option(const struct sk_buff *skb) { return htonl(0u); } static inline void mptcp_active_detect_blackhole(struct sock *sk, bool expired) { } #endif /* CONFIG_MPTCP */ #if IS_ENABLED(CONFIG_MPTCP_IPV6) int mptcpv6_init(void); void mptcpv6_handle_mapped(struct sock *sk, bool mapped); #elif IS_ENABLED(CONFIG_IPV6) static inline int mptcpv6_init(void) { return 0; } static inline void mptcpv6_handle_mapped(struct sock *sk, bool mapped) { } #endif #if defined(CONFIG_MPTCP) && defined(CONFIG_BPF_SYSCALL) struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk); #else static inline struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk) { return NULL; } #endif #if !IS_ENABLED(CONFIG_MPTCP) struct mptcp_sock { }; #endif #endif /* __NET_MPTCP_H */
37 22 15 25 11 1 4 2 2 4 33 15 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 // SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> struct nft_last { unsigned long jiffies; unsigned int set; }; struct nft_last_priv { struct nft_last *last; }; static const struct nla_policy nft_last_policy[NFTA_LAST_MAX + 1] = { [NFTA_LAST_SET] = { .type = NLA_U32 }, [NFTA_LAST_MSECS] = { .type = NLA_U64 }, }; static int nft_last_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_last_priv *priv = nft_expr_priv(expr); struct nft_last *last; u64 last_jiffies; int err; last = kzalloc(sizeof(*last), GFP_KERNEL_ACCOUNT); if (!last) return -ENOMEM; if (tb[NFTA_LAST_SET]) last->set = ntohl(nla_get_be32(tb[NFTA_LAST_SET])); if (last->set && tb[NFTA_LAST_MSECS]) { err = nf_msecs_to_jiffies64(tb[NFTA_LAST_MSECS], &last_jiffies); if (err < 0) goto err; last->jiffies = jiffies - (unsigned long)last_jiffies; } priv->last = last; return 0; err: kfree(last); return err; } static void nft_last_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_last_priv *priv = nft_expr_priv(expr); struct nft_last *last = priv->last; if (READ_ONCE(last->jiffies) != jiffies) WRITE_ONCE(last->jiffies, jiffies); if (READ_ONCE(last->set) == 0) WRITE_ONCE(last->set, 1); } static int nft_last_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { struct nft_last_priv *priv = nft_expr_priv(expr); struct nft_last *last = priv->last; unsigned long last_jiffies = READ_ONCE(last->jiffies); u32 last_set = READ_ONCE(last->set); __be64 msecs; if (time_before(jiffies, last_jiffies)) { WRITE_ONCE(last->set, 0); last_set = 0; } if (last_set) msecs = nf_jiffies64_to_msecs(jiffies - last_jiffies); else msecs = 0; if (nla_put_be32(skb, NFTA_LAST_SET, htonl(last_set)) || nla_put_be64(skb, NFTA_LAST_MSECS, msecs, NFTA_LAST_PAD)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static void nft_last_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_last_priv *priv = nft_expr_priv(expr); kfree(priv->last); } static int nft_last_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp) { struct nft_last_priv *priv_dst = nft_expr_priv(dst); struct nft_last_priv *priv_src = nft_expr_priv(src); priv_dst->last = kzalloc(sizeof(*priv_dst->last), gfp); if (!priv_dst->last) return -ENOMEM; priv_dst->last->set = priv_src->last->set; priv_dst->last->jiffies = priv_src->last->jiffies; return 0; } static const struct nft_expr_ops nft_last_ops = { .type = &nft_last_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_last_priv)), .eval = nft_last_eval, .init = nft_last_init, .destroy = nft_last_destroy, .clone = nft_last_clone, .dump = nft_last_dump, .reduce = NFT_REDUCE_READONLY, }; struct nft_expr_type nft_last_type __read_mostly = { .name = "last", .ops = &nft_last_ops, .policy = nft_last_policy, .maxattr = NFTA_LAST_MAX, .flags = NFT_EXPR_STATEFUL, .owner = THIS_MODULE, };
12 3 6 17 17 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 // SPDX-License-Identifier: GPL-2.0 /* XDP user-space ring structure * Copyright(c) 2018 Intel Corporation. */ #include <linux/log2.h> #include <linux/slab.h> #include <linux/overflow.h> #include <linux/vmalloc.h> #include <net/xdp_sock_drv.h> #include "xsk_queue.h" static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue) { struct xdp_umem_ring *umem_ring; struct xdp_rxtx_ring *rxtx_ring; if (umem_queue) return struct_size(umem_ring, desc, q->nentries); return struct_size(rxtx_ring, desc, q->nentries); } struct xsk_queue *xskq_create(u32 nentries, bool umem_queue) { struct xsk_queue *q; size_t size; q = kzalloc(sizeof(*q), GFP_KERNEL); if (!q) return NULL; q->nentries = nentries; q->ring_mask = nentries - 1; size = xskq_get_ring_size(q, umem_queue); /* size which is overflowing or close to SIZE_MAX will become 0 in * PAGE_ALIGN(), checking SIZE_MAX is enough due to the previous * is_power_of_2(), the rest will be handled by vmalloc_user() */ if (unlikely(size == SIZE_MAX)) { kfree(q); return NULL; } size = PAGE_ALIGN(size); q->ring = vmalloc_user(size); if (!q->ring) { kfree(q); return NULL; } q->ring_vmalloc_size = size; return q; } void xskq_destroy(struct xsk_queue *q) { if (!q) return; vfree(q->ring); kfree(q); }
20 17 9 9 13 257 257 45 303 302 301 11 11 11 11 11 11 11 11 302 302 11 14 290 291 290 14 14 14 14 14 14 14 14 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/proc/inode.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/cache.h> #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/kernel.h> #include <linux/pid_namespace.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/stat.h> #include <linux/completion.h> #include <linux/poll.h> #include <linux/printk.h> #include <linux/file.h> #include <linux/limits.h> #include <linux/init.h> #include <linux/module.h> #include <linux/sysctl.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/mount.h> #include <linux/bug.h> #include "internal.h" static void proc_evict_inode(struct inode *inode) { struct ctl_table_header *head; struct proc_inode *ei = PROC_I(inode); truncate_inode_pages_final(&inode->i_data); clear_inode(inode); /* Stop tracking associated processes */ if (ei->pid) proc_pid_evict_inode(ei); head = ei->sysctl; if (head) { WRITE_ONCE(ei->sysctl, NULL); proc_sys_evict_inode(inode, head); } } static struct kmem_cache *proc_inode_cachep __ro_after_init; static struct kmem_cache *pde_opener_cache __ro_after_init; static struct inode *proc_alloc_inode(struct super_block *sb) { struct proc_inode *ei; ei = alloc_inode_sb(sb, proc_inode_cachep, GFP_KERNEL); if (!ei) return NULL; ei->pid = NULL; ei->fd = 0; ei->op.proc_get_link = NULL; ei->pde = NULL; ei->sysctl = NULL; ei->sysctl_entry = NULL; INIT_HLIST_NODE(&ei->sibling_inodes); ei->ns_ops = NULL; return &ei->vfs_inode; } static void proc_free_inode(struct inode *inode) { struct proc_inode *ei = PROC_I(inode); if (ei->pid) put_pid(ei->pid); /* Let go of any associated proc directory entry */ if (ei->pde) pde_put(ei->pde); kmem_cache_free(proc_inode_cachep, PROC_I(inode)); } static void init_once(void *foo) { struct proc_inode *ei = (struct proc_inode *) foo; inode_init_once(&ei->vfs_inode); } void __init proc_init_kmemcache(void) { proc_inode_cachep = kmem_cache_create("proc_inode_cache", sizeof(struct proc_inode), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_ACCOUNT| SLAB_PANIC), init_once); pde_opener_cache = kmem_cache_create("pde_opener", sizeof(struct pde_opener), 0, SLAB_ACCOUNT|SLAB_PANIC, NULL); proc_dir_entry_cache = kmem_cache_create_usercopy( "proc_dir_entry", SIZEOF_PDE, 0, SLAB_PANIC, offsetof(struct proc_dir_entry, inline_name), SIZEOF_PDE_INLINE_NAME, NULL); BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE); } void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock) { struct hlist_node *node; struct super_block *old_sb = NULL; rcu_read_lock(); while ((node = hlist_first_rcu(inodes))) { struct proc_inode *ei = hlist_entry(node, struct proc_inode, sibling_inodes); struct super_block *sb; struct inode *inode; spin_lock(lock); hlist_del_init_rcu(&ei->sibling_inodes); spin_unlock(lock); inode = &ei->vfs_inode; sb = inode->i_sb; if ((sb != old_sb) && !atomic_inc_not_zero(&sb->s_active)) continue; inode = igrab(inode); rcu_read_unlock(); if (sb != old_sb) { if (old_sb) deactivate_super(old_sb); old_sb = sb; } if (unlikely(!inode)) { rcu_read_lock(); continue; } if (S_ISDIR(inode->i_mode)) { struct dentry *dir = d_find_any_alias(inode); if (dir) { d_invalidate(dir); dput(dir); } } else { struct dentry *dentry; while ((dentry = d_find_alias(inode))) { d_invalidate(dentry); dput(dentry); } } iput(inode); rcu_read_lock(); } rcu_read_unlock(); if (old_sb) deactivate_super(old_sb); } static inline const char *hidepid2str(enum proc_hidepid v) { switch (v) { case HIDEPID_OFF: return "off"; case HIDEPID_NO_ACCESS: return "noaccess"; case HIDEPID_INVISIBLE: return "invisible"; case HIDEPID_NOT_PTRACEABLE: return "ptraceable"; } WARN_ONCE(1, "bad hide_pid value: %d\n", v); return "unknown"; } static int proc_show_options(struct seq_file *seq, struct dentry *root) { struct proc_fs_info *fs_info = proc_sb_info(root->d_sb); if (!gid_eq(fs_info->pid_gid, GLOBAL_ROOT_GID)) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, fs_info->pid_gid)); if (fs_info->hide_pid != HIDEPID_OFF) seq_printf(seq, ",hidepid=%s", hidepid2str(fs_info->hide_pid)); if (fs_info->pidonly != PROC_PIDONLY_OFF) seq_printf(seq, ",subset=pid"); return 0; } const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .free_inode = proc_free_inode, .drop_inode = inode_just_drop, .evict_inode = proc_evict_inode, .statfs = simple_statfs, .show_options = proc_show_options, }; enum {BIAS = -1U<<31}; static inline int use_pde(struct proc_dir_entry *pde) { return likely(atomic_inc_unless_negative(&pde->in_use)); } static void unuse_pde(struct proc_dir_entry *pde) { if (unlikely(atomic_dec_return(&pde->in_use) == BIAS)) complete(pde->pde_unload_completion); } /* * At most 2 contexts can enter this function: the one doing the last * close on the descriptor and whoever is deleting PDE itself. * * First to enter calls ->proc_release hook and signals its completion * to the second one which waits and then does nothing. * * PDE is locked on entry, unlocked on exit. */ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) __releases(&pde->pde_unload_lock) { /* * close() (proc_reg_release()) can't delete an entry and proceed: * ->release hook needs to be available at the right moment. * * rmmod (remove_proc_entry() et al) can't delete an entry and proceed: * "struct file" needs to be available at the right moment. */ if (pdeo->closing) { /* somebody else is doing that, just wait */ DECLARE_COMPLETION_ONSTACK(c); pdeo->c = &c; spin_unlock(&pde->pde_unload_lock); wait_for_completion(&c); } else { struct file *file; struct completion *c; pdeo->closing = true; spin_unlock(&pde->pde_unload_lock); file = pdeo->file; pde->proc_ops->proc_release(file_inode(file), file); spin_lock(&pde->pde_unload_lock); /* Strictly after ->proc_release, see above. */ list_del(&pdeo->lh); c = pdeo->c; spin_unlock(&pde->pde_unload_lock); if (unlikely(c)) complete(c); kmem_cache_free(pde_opener_cache, pdeo); } } void proc_entry_rundown(struct proc_dir_entry *de) { DECLARE_COMPLETION_ONSTACK(c); /* Wait until all existing callers into module are done. */ de->pde_unload_completion = &c; if (atomic_add_return(BIAS, &de->in_use) != BIAS) wait_for_completion(&c); /* ->pde_openers list can't grow from now on. */ spin_lock(&de->pde_unload_lock); while (!list_empty(&de->pde_openers)) { struct pde_opener *pdeo; pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); close_pdeo(de, pdeo); spin_lock(&de->pde_unload_lock); } spin_unlock(&de->pde_unload_lock); } static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) { struct proc_dir_entry *pde = PDE(file_inode(file)); loff_t rv = -EINVAL; if (pde_is_permanent(pde)) { return pde->proc_ops->proc_lseek(file, offset, whence); } else if (use_pde(pde)) { rv = pde->proc_ops->proc_lseek(file, offset, whence); unuse_pde(pde); } return rv; } static ssize_t proc_reg_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct proc_dir_entry *pde = PDE(file_inode(iocb->ki_filp)); ssize_t ret; if (pde_is_permanent(pde)) return pde->proc_ops->proc_read_iter(iocb, iter); if (!use_pde(pde)) return -EIO; ret = pde->proc_ops->proc_read_iter(iocb, iter); unuse_pde(pde); return ret; } static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos) { __auto_type read = pde->proc_ops->proc_read; if (read) return read(file, buf, count, ppos); return -EIO; } static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; if (pde_is_permanent(pde)) { return pde_read(pde, file, buf, count, ppos); } else if (use_pde(pde)) { rv = pde_read(pde, file, buf, count, ppos); unuse_pde(pde); } return rv; } static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos) { __auto_type write = pde->proc_ops->proc_write; if (write) return write(file, buf, count, ppos); return -EIO; } static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; if (pde_is_permanent(pde)) { return pde_write(pde, file, buf, count, ppos); } else if (use_pde(pde)) { rv = pde_write(pde, file, buf, count, ppos); unuse_pde(pde); } return rv; } static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts) { __auto_type poll = pde->proc_ops->proc_poll; if (poll) return poll(file, pts); return DEFAULT_POLLMASK; } static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts) { struct proc_dir_entry *pde = PDE(file_inode(file)); __poll_t rv = DEFAULT_POLLMASK; if (pde_is_permanent(pde)) { return pde_poll(pde, file, pts); } else if (use_pde(pde)) { rv = pde_poll(pde, file, pts); unuse_pde(pde); } return rv; } static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) { __auto_type ioctl = pde->proc_ops->proc_ioctl; if (ioctl) return ioctl(file, cmd, arg); return -ENOTTY; } static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; if (pde_is_permanent(pde)) { return pde_ioctl(pde, file, cmd, arg); } else if (use_pde(pde)) { rv = pde_ioctl(pde, file, cmd, arg); unuse_pde(pde); } return rv; } #ifdef CONFIG_COMPAT static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) { __auto_type compat_ioctl = pde->proc_ops->proc_compat_ioctl; if (compat_ioctl) return compat_ioctl(file, cmd, arg); return -ENOTTY; } static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; if (pde_is_permanent(pde)) { return pde_compat_ioctl(pde, file, cmd, arg); } else if (use_pde(pde)) { rv = pde_compat_ioctl(pde, file, cmd, arg); unuse_pde(pde); } return rv; } #endif static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma) { __auto_type mmap = pde->proc_ops->proc_mmap; if (mmap) return mmap(file, vma); return -EIO; } static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) { struct proc_dir_entry *pde = PDE(file_inode(file)); int rv = -EIO; if (pde_is_permanent(pde)) { return pde_mmap(pde, file, vma); } else if (use_pde(pde)) { rv = pde_mmap(pde, file, vma); unuse_pde(pde); } return rv; } static unsigned long pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) { if (pde->proc_ops->proc_get_unmapped_area) return pde->proc_ops->proc_get_unmapped_area(file, orig_addr, len, pgoff, flags); #ifdef CONFIG_MMU return mm_get_unmapped_area(current->mm, file, orig_addr, len, pgoff, flags); #endif return orig_addr; } static unsigned long proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct proc_dir_entry *pde = PDE(file_inode(file)); unsigned long rv = -EIO; if (pde_is_permanent(pde)) { return pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags); } else if (use_pde(pde)) { rv = pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags); unuse_pde(pde); } return rv; } static int proc_reg_open(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); int rv = 0; typeof_member(struct proc_ops, proc_open) open; struct pde_opener *pdeo; if (!pde_has_proc_lseek(pde)) file->f_mode &= ~FMODE_LSEEK; if (pde_is_permanent(pde)) { open = pde->proc_ops->proc_open; if (open) rv = open(inode, file); return rv; } /* * Ensure that * 1) PDE's ->release hook will be called no matter what * either normally by close()/->release, or forcefully by * rmmod/remove_proc_entry. * * 2) rmmod isn't blocked by opening file in /proc and sitting on * the descriptor (including "rmmod foo </proc/foo" scenario). * * Save every "struct file" with custom ->release hook. */ if (!use_pde(pde)) return -ENOENT; __auto_type release = pde->proc_ops->proc_release; if (release) { pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL); if (!pdeo) { rv = -ENOMEM; goto out_unuse; } } open = pde->proc_ops->proc_open; if (open) rv = open(inode, file); if (release) { if (rv == 0) { /* To know what to release. */ pdeo->file = file; pdeo->closing = false; pdeo->c = NULL; spin_lock(&pde->pde_unload_lock); list_add(&pdeo->lh, &pde->pde_openers); spin_unlock(&pde->pde_unload_lock); } else kmem_cache_free(pde_opener_cache, pdeo); } out_unuse: unuse_pde(pde); return rv; } static int proc_reg_release(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); struct pde_opener *pdeo; if (pde_is_permanent(pde)) { __auto_type release = pde->proc_ops->proc_release; if (release) { return release(inode, file); } return 0; } spin_lock(&pde->pde_unload_lock); list_for_each_entry(pdeo, &pde->pde_openers, lh) { if (pdeo->file == file) { close_pdeo(pde, pdeo); return 0; } } spin_unlock(&pde->pde_unload_lock); return 0; } static const struct file_operations proc_reg_file_ops = { .llseek = proc_reg_llseek, .read = proc_reg_read, .write = proc_reg_write, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .mmap = proc_reg_mmap, .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, .release = proc_reg_release, }; static const struct file_operations proc_iter_file_ops = { .llseek = proc_reg_llseek, .read_iter = proc_reg_read_iter, .write = proc_reg_write, .splice_read = copy_splice_read, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .mmap = proc_reg_mmap, .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, .release = proc_reg_release, }; #ifdef CONFIG_COMPAT static const struct file_operations proc_reg_file_ops_compat = { .llseek = proc_reg_llseek, .read = proc_reg_read, .write = proc_reg_write, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .compat_ioctl = proc_reg_compat_ioctl, .mmap = proc_reg_mmap, .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, .release = proc_reg_release, }; static const struct file_operations proc_iter_file_ops_compat = { .llseek = proc_reg_llseek, .read_iter = proc_reg_read_iter, .splice_read = copy_splice_read, .write = proc_reg_write, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .compat_ioctl = proc_reg_compat_ioctl, .mmap = proc_reg_mmap, .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, .release = proc_reg_release, }; #endif static void proc_put_link(void *p) { unuse_pde(p); } static const char *proc_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct proc_dir_entry *pde = PDE(inode); if (!use_pde(pde)) return ERR_PTR(-EINVAL); set_delayed_call(done, proc_put_link, pde); return pde->data; } const struct inode_operations proc_link_inode_operations = { .get_link = proc_get_link, }; struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) { struct inode *inode = new_inode(sb); if (!inode) { pde_put(de); return NULL; } inode->i_private = de->data; inode->i_ino = de->low_ino; simple_inode_init_ts(inode); PROC_I(inode)->pde = de; if (is_empty_pde(de)) { make_empty_dir_inode(inode); return inode; } if (de->mode) { inode->i_mode = de->mode; inode->i_uid = de->uid; inode->i_gid = de->gid; } if (de->size) inode->i_size = de->size; if (de->nlink) set_nlink(inode, de->nlink); if (S_ISREG(inode->i_mode)) { inode->i_op = de->proc_iops; if (pde_has_proc_read_iter(de)) inode->i_fop = &proc_iter_file_ops; else inode->i_fop = &proc_reg_file_ops; #ifdef CONFIG_COMPAT if (pde_has_proc_compat_ioctl(de)) { if (pde_has_proc_read_iter(de)) inode->i_fop = &proc_iter_file_ops_compat; else inode->i_fop = &proc_reg_file_ops_compat; } #endif } else if (S_ISDIR(inode->i_mode)) { inode->i_op = de->proc_iops; inode->i_fop = de->proc_dir_ops; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = de->proc_iops; inode->i_fop = NULL; } else { BUG(); } return inode; }
4 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 /* * xt_time * Copyright © CC Computer Consultants GmbH, 2007 * * based on ipt_time by Fabrice MARIE <fabrice@netfilter.org> * This is a module which is used for time matching * It is using some modified code from dietlibc (localtime() function) * that you can find at https://www.fefe.de/dietlibc/ * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from gnu.org/gpl. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/ktime.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/types.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_time.h> struct xtm { u_int8_t month; /* (1-12) */ u_int8_t monthday; /* (1-31) */ u_int8_t weekday; /* (1-7) */ u_int8_t hour; /* (0-23) */ u_int8_t minute; /* (0-59) */ u_int8_t second; /* (0-59) */ unsigned int dse; }; extern struct timezone sys_tz; /* ouch */ static const u_int16_t days_since_year[] = { 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, }; static const u_int16_t days_since_leapyear[] = { 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, }; /* * Since time progresses forward, it is best to organize this array in reverse, * to minimize lookup time. */ enum { DSE_FIRST = 2039, SECONDS_PER_DAY = 86400, }; static const u_int16_t days_since_epoch[] = { /* 2039 - 2030 */ 25202, 24837, 24472, 24106, 23741, 23376, 23011, 22645, 22280, 21915, /* 2029 - 2020 */ 21550, 21184, 20819, 20454, 20089, 19723, 19358, 18993, 18628, 18262, /* 2019 - 2010 */ 17897, 17532, 17167, 16801, 16436, 16071, 15706, 15340, 14975, 14610, /* 2009 - 2000 */ 14245, 13879, 13514, 13149, 12784, 12418, 12053, 11688, 11323, 10957, /* 1999 - 1990 */ 10592, 10227, 9862, 9496, 9131, 8766, 8401, 8035, 7670, 7305, /* 1989 - 1980 */ 6940, 6574, 6209, 5844, 5479, 5113, 4748, 4383, 4018, 3652, /* 1979 - 1970 */ 3287, 2922, 2557, 2191, 1826, 1461, 1096, 730, 365, 0, }; static inline bool is_leap(unsigned int y) { return y % 4 == 0 && (y % 100 != 0 || y % 400 == 0); } /* * Each network packet has a (nano)seconds-since-the-epoch (SSTE) timestamp. * Since we match against days and daytime, the SSTE value needs to be * computed back into human-readable dates. * * This is done in three separate functions so that the most expensive * calculations are done last, in case a "simple match" can be found earlier. */ static inline unsigned int localtime_1(struct xtm *r, time64_t time) { unsigned int v, w; /* Each day has 86400s, so finding the hour/minute is actually easy. */ div_u64_rem(time, SECONDS_PER_DAY, &v); r->second = v % 60; w = v / 60; r->minute = w % 60; r->hour = w / 60; return v; } static inline void localtime_2(struct xtm *r, time64_t time) { /* * Here comes the rest (weekday, monthday). First, divide the SSTE * by seconds-per-day to get the number of _days_ since the epoch. */ r->dse = div_u64(time, SECONDS_PER_DAY); /* * 1970-01-01 (w=0) was a Thursday (4). * -1 and +1 map Sunday properly onto 7. */ r->weekday = (4 + r->dse - 1) % 7 + 1; } static void localtime_3(struct xtm *r, time64_t time) { unsigned int year, i, w = r->dse; /* * In each year, a certain number of days-since-the-epoch have passed. * Find the year that is closest to said days. * * Consider, for example, w=21612 (2029-03-04). Loop will abort on * dse[i] <= w, which happens when dse[i] == 21550. This implies * year == 2009. w will then be 62. */ for (i = 0, year = DSE_FIRST; days_since_epoch[i] > w; ++i, --year) /* just loop */; w -= days_since_epoch[i]; /* * By now we have the current year, and the day of the year. * r->yearday = w; * * On to finding the month (like above). In each month, a certain * number of days-since-New Year have passed, and find the closest * one. * * Consider w=62 (in a non-leap year). Loop will abort on * dsy[i] < w, which happens when dsy[i] == 31+28 (i == 2). * Concludes i == 2, i.e. 3rd month => March. * * (A different approach to use would be to subtract a monthlength * from w repeatedly while counting.) */ if (is_leap(year)) { /* use days_since_leapyear[] in a leap year */ for (i = ARRAY_SIZE(days_since_leapyear) - 1; i > 0 && days_since_leapyear[i] > w; --i) /* just loop */; r->monthday = w - days_since_leapyear[i] + 1; } else { for (i = ARRAY_SIZE(days_since_year) - 1; i > 0 && days_since_year[i] > w; --i) /* just loop */; r->monthday = w - days_since_year[i] + 1; } r->month = i + 1; } static bool time_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_time_info *info = par->matchinfo; unsigned int packet_time; struct xtm current_time; time64_t stamp; /* * We need real time here, but we can neither use skb->tstamp * nor __net_timestamp(). * * skb->tstamp and skb->skb_mstamp_ns overlap, however, they * use different clock types (real vs monotonic). * * Suppose you have two rules: * 1. match before 13:00 * 2. match after 13:00 * * If you match against processing time (ktime_get_real_seconds) it * may happen that the same packet matches both rules if * it arrived at the right moment before 13:00, so it would be * better to check skb->tstamp and set it via __net_timestamp() * if needed. This however breaks outgoing packets tx timestamp, * and causes them to get delayed forever by fq packet scheduler. */ stamp = ktime_get_real_seconds(); if (info->flags & XT_TIME_LOCAL_TZ) /* Adjust for local timezone */ stamp -= 60 * sys_tz.tz_minuteswest; /* * xt_time will match when _all_ of the following hold: * - 'now' is in the global time range date_start..date_end * - 'now' is in the monthday mask * - 'now' is in the weekday mask * - 'now' is in the daytime range time_start..time_end * (and by default, libxt_time will set these so as to match) * * note: info->date_start/stop are unsigned 32-bit values that * can hold values beyond y2038, but not after y2106. */ if (stamp < info->date_start || stamp > info->date_stop) return false; packet_time = localtime_1(&current_time, stamp); if (info->daytime_start < info->daytime_stop) { if (packet_time < info->daytime_start || packet_time > info->daytime_stop) return false; } else { if (packet_time < info->daytime_start && packet_time > info->daytime_stop) return false; /** if user asked to ignore 'next day', then e.g. * '1 PM Wed, August 1st' should be treated * like 'Tue 1 PM July 31st'. * * This also causes * 'Monday, "23:00 to 01:00", to match for 2 hours, starting * Monday 23:00 to Tuesday 01:00. */ if ((info->flags & XT_TIME_CONTIGUOUS) && packet_time <= info->daytime_stop) stamp -= SECONDS_PER_DAY; } localtime_2(&current_time, stamp); if (!(info->weekdays_match & (1 << current_time.weekday))) return false; /* Do not spend time computing monthday if all days match anyway */ if (info->monthdays_match != XT_TIME_ALL_MONTHDAYS) { localtime_3(&current_time, stamp); if (!(info->monthdays_match & (1 << current_time.monthday))) return false; } return true; } static int time_mt_check(const struct xt_mtchk_param *par) { const struct xt_time_info *info = par->matchinfo; if (info->daytime_start > XT_TIME_MAX_DAYTIME || info->daytime_stop > XT_TIME_MAX_DAYTIME) { pr_info_ratelimited("invalid argument - start or stop time greater than 23:59:59\n"); return -EDOM; } if (info->flags & ~XT_TIME_ALL_FLAGS) { pr_info_ratelimited("unknown flags 0x%x\n", info->flags & ~XT_TIME_ALL_FLAGS); return -EINVAL; } if ((info->flags & XT_TIME_CONTIGUOUS) && info->daytime_start < info->daytime_stop) return -EINVAL; return 0; } static struct xt_match xt_time_mt_reg __read_mostly = { .name = "time", .family = NFPROTO_UNSPEC, .match = time_mt, .checkentry = time_mt_check, .matchsize = sizeof(struct xt_time_info), .me = THIS_MODULE, }; static int __init time_mt_init(void) { int minutes = sys_tz.tz_minuteswest; if (minutes < 0) /* east of Greenwich */ pr_info("kernel timezone is +%02d%02d\n", -minutes / 60, -minutes % 60); else /* west of Greenwich */ pr_info("kernel timezone is -%02d%02d\n", minutes / 60, minutes % 60); return xt_register_match(&xt_time_mt_reg); } static void __exit time_mt_exit(void) { xt_unregister_match(&xt_time_mt_reg); } module_init(time_mt_init); module_exit(time_mt_exit); MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); MODULE_DESCRIPTION("Xtables: time-based matching"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_time"); MODULE_ALIAS("ip6t_time");
10993 104 607 347 685 104 10223 617 10223 11638 1308 10236 10488 10479 10493 722 722 723 11637 10516 10148 10172 11613 11616 11641 5406 7241 10290 9707 979 754 11297 10730 445 10612 10630 10628 1627 1769 9681 9686 940 9690 17 18 9720 84 139 6886 26 1 26 6987 9678 9669 9704 9677 148 9702 111 554 971 126 12 207 9132 20 209 756 1294 223 1297 1282 26 1 6 1267 721 712 11 75 1266 11 961 543 1296 1299 1299 30 1295 49 645 512 728 5 5 5 5 5 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 /* SPDX-License-Identifier: GPL-2.0 */ /* * Resizable, Scalable, Concurrent Hash Table * * Copyright (c) 2015-2016 Herbert Xu <herbert@gondor.apana.org.au> * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch> * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net> * * Code partially derived from nft_hash * Rewritten with rehash code from br_multicast plus single list * pointer as suggested by Josh Triplett * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #ifndef _LINUX_RHASHTABLE_H #define _LINUX_RHASHTABLE_H #include <linux/err.h> #include <linux/errno.h> #include <linux/jhash.h> #include <linux/list_nulls.h> #include <linux/workqueue.h> #include <linux/rculist.h> #include <linux/bit_spinlock.h> #include <linux/rhashtable-types.h> /* * Objects in an rhashtable have an embedded struct rhash_head * which is linked into as hash chain from the hash table - or one * of two or more hash tables when the rhashtable is being resized. * The end of the chain is marked with a special nulls marks which has * the least significant bit set but otherwise stores the address of * the hash bucket. This allows us to be sure we've found the end * of the right list. * The value stored in the hash bucket has BIT(0) used as a lock bit. * This bit must be atomically set before any changes are made to * the chain. To avoid dereferencing this pointer without clearing * the bit first, we use an opaque 'struct rhash_lock_head *' for the * pointer stored in the bucket. This struct needs to be defined so * that rcu_dereference() works on it, but it has no content so a * cast is needed for it to be useful. This ensures it isn't * used by mistake with clearing the lock bit first. */ struct rhash_lock_head {}; /* Maximum chain length before rehash * * The maximum (not average) chain length grows with the size of the hash * table, at a rate of (log N)/(log log N). * * The value of 16 is selected so that even if the hash table grew to * 2^32 you would not expect the maximum chain length to exceed it * unless we are under attack (or extremely unlucky). * * As this limit is only to detect attacks, we don't need to set it to a * lower value as you'd need the chain length to vastly exceed 16 to have * any real effect on the system. */ #define RHT_ELASTICITY 16u /** * struct bucket_table - Table of hash buckets * @size: Number of hash buckets * @nest: Number of bits of first-level nested table. * @rehash: Current bucket being rehashed * @hash_rnd: Random seed to fold into hash * @walkers: List of active walkers * @rcu: RCU structure for freeing the table * @future_tbl: Table under construction during rehashing * @ntbl: Nested table used when out of memory. * @buckets: size * hash buckets */ struct bucket_table { unsigned int size; unsigned int nest; u32 hash_rnd; struct list_head walkers; struct rcu_head rcu; struct bucket_table __rcu *future_tbl; struct lockdep_map dep_map; struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; /* * NULLS_MARKER() expects a hash value with the low * bits mostly likely to be significant, and it discards * the msb. * We give it an address, in which the bottom bit is * always 0, and the msb might be significant. * So we shift the address down one bit to align with * expectations and avoid losing a significant bit. * * We never store the NULLS_MARKER in the hash table * itself as we need the lsb for locking. * Instead we store a NULL */ #define RHT_NULLS_MARKER(ptr) \ ((void *)NULLS_MARKER(((unsigned long) (ptr)) >> 1)) #define INIT_RHT_NULLS_HEAD(ptr) \ ((ptr) = NULL) static inline bool rht_is_a_nulls(const struct rhash_head *ptr) { return ((unsigned long) ptr & 1); } static inline void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he) { return (char *)he - ht->p.head_offset; } static inline unsigned int rht_bucket_index(const struct bucket_table *tbl, unsigned int hash) { return hash & (tbl->size - 1); } static inline unsigned int rht_key_get_hash(struct rhashtable *ht, const void *key, const struct rhashtable_params params, unsigned int hash_rnd) { unsigned int hash; /* params must be equal to ht->p if it isn't constant. */ if (!__builtin_constant_p(params.key_len)) hash = ht->p.hashfn(key, ht->key_len, hash_rnd); else if (params.key_len) { unsigned int key_len = params.key_len; if (params.hashfn) hash = params.hashfn(key, key_len, hash_rnd); else if (key_len & (sizeof(u32) - 1)) hash = jhash(key, key_len, hash_rnd); else hash = jhash2(key, key_len / sizeof(u32), hash_rnd); } else { unsigned int key_len = ht->p.key_len; if (params.hashfn) hash = params.hashfn(key, key_len, hash_rnd); else hash = jhash(key, key_len, hash_rnd); } return hash; } static inline unsigned int rht_key_hashfn( struct rhashtable *ht, const struct bucket_table *tbl, const void *key, const struct rhashtable_params params) { unsigned int hash = rht_key_get_hash(ht, key, params, tbl->hash_rnd); return rht_bucket_index(tbl, hash); } static inline unsigned int rht_head_hashfn( struct rhashtable *ht, const struct bucket_table *tbl, const struct rhash_head *he, const struct rhashtable_params params) { const char *ptr = rht_obj(ht, he); return likely(params.obj_hashfn) ? rht_bucket_index(tbl, params.obj_hashfn(ptr, params.key_len ?: ht->p.key_len, tbl->hash_rnd)) : rht_key_hashfn(ht, tbl, ptr + params.key_offset, params); } /** * rht_grow_above_75 - returns true if nelems > 0.75 * table-size * @ht: hash table * @tbl: current table */ static inline bool rht_grow_above_75(const struct rhashtable *ht, const struct bucket_table *tbl) { /* Expand table when exceeding 75% load */ return atomic_read(&ht->nelems) > (tbl->size / 4 * 3) && (!ht->p.max_size || tbl->size < ht->p.max_size); } /** * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size * @ht: hash table * @tbl: current table */ static inline bool rht_shrink_below_30(const struct rhashtable *ht, const struct bucket_table *tbl) { /* Shrink table beneath 30% load */ return atomic_read(&ht->nelems) < (tbl->size * 3 / 10) && tbl->size > ht->p.min_size; } /** * rht_grow_above_100 - returns true if nelems > table-size * @ht: hash table * @tbl: current table */ static inline bool rht_grow_above_100(const struct rhashtable *ht, const struct bucket_table *tbl) { return atomic_read(&ht->nelems) > tbl->size && (!ht->p.max_size || tbl->size < ht->p.max_size); } /** * rht_grow_above_max - returns true if table is above maximum * @ht: hash table * @tbl: current table */ static inline bool rht_grow_above_max(const struct rhashtable *ht, const struct bucket_table *tbl) { return atomic_read(&ht->nelems) >= ht->max_elems; } #ifdef CONFIG_PROVE_LOCKING int lockdep_rht_mutex_is_held(struct rhashtable *ht); int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash); #else static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht) { return 1; } static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) { return 1; } #endif /* CONFIG_PROVE_LOCKING */ void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, struct rhash_head *obj); void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter); void rhashtable_walk_exit(struct rhashtable_iter *iter); int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires(RCU); static inline void rhashtable_walk_start(struct rhashtable_iter *iter) { (void)rhashtable_walk_start_check(iter); } void *rhashtable_walk_next(struct rhashtable_iter *iter); void *rhashtable_walk_peek(struct rhashtable_iter *iter); void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU); void rhashtable_free_and_destroy(struct rhashtable *ht, void (*free_fn)(void *ptr, void *arg), void *arg); void rhashtable_destroy(struct rhashtable *ht); struct rhash_lock_head __rcu **rht_bucket_nested( const struct bucket_table *tbl, unsigned int hash); struct rhash_lock_head __rcu **__rht_bucket_nested( const struct bucket_table *tbl, unsigned int hash); struct rhash_lock_head __rcu **rht_bucket_nested_insert( struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash); #define rht_dereference(p, ht) \ rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht)) #define rht_dereference_rcu(p, ht) \ rcu_dereference_check(p, lockdep_rht_mutex_is_held(ht)) #define rht_dereference_bucket(p, tbl, hash) \ rcu_dereference_protected(p, lockdep_rht_bucket_is_held(tbl, hash)) #define rht_dereference_bucket_rcu(p, tbl, hash) \ rcu_dereference_check(p, lockdep_rht_bucket_is_held(tbl, hash)) #define rht_entry(tpos, pos, member) \ ({ tpos = container_of(pos, typeof(*tpos), member); 1; }) static inline struct rhash_lock_head __rcu *const *rht_bucket( const struct bucket_table *tbl, unsigned int hash) { return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) : &tbl->buckets[hash]; } static inline struct rhash_lock_head __rcu **rht_bucket_var( struct bucket_table *tbl, unsigned int hash) { return unlikely(tbl->nest) ? __rht_bucket_nested(tbl, hash) : &tbl->buckets[hash]; } static inline struct rhash_lock_head __rcu **rht_bucket_insert( struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) { return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) : &tbl->buckets[hash]; } /* * We lock a bucket by setting BIT(0) in the pointer - this is always * zero in real pointers. The NULLS mark is never stored in the bucket, * rather we store NULL if the bucket is empty. * bit_spin_locks do not handle contention well, but the whole point * of the hashtable design is to achieve minimum per-bucket contention. * A nested hash table might not have a bucket pointer. In that case * we cannot get a lock. For remove and replace the bucket cannot be * interesting and doesn't need locking. * For insert we allocate the bucket if this is the last bucket_table, * and then take the lock. * Sometimes we unlock a bucket by writing a new pointer there. In that * case we don't need to unlock, but we do need to reset state such as * local_bh. For that we have rht_assign_unlock(). As rcu_assign_pointer() * provides the same release semantics that bit_spin_unlock() provides, * this is safe. * When we write to a bucket without unlocking, we use rht_assign_locked(). */ static inline unsigned long rht_lock(struct bucket_table *tbl, struct rhash_lock_head __rcu **bkt) { unsigned long flags; local_irq_save(flags); bit_spin_lock(0, (unsigned long *)bkt); lock_map_acquire(&tbl->dep_map); return flags; } static inline unsigned long rht_lock_nested(struct bucket_table *tbl, struct rhash_lock_head __rcu **bucket, unsigned int subclass) { unsigned long flags; local_irq_save(flags); bit_spin_lock(0, (unsigned long *)bucket); lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_); return flags; } static inline void rht_unlock(struct bucket_table *tbl, struct rhash_lock_head __rcu **bkt, unsigned long flags) { lock_map_release(&tbl->dep_map); bit_spin_unlock(0, (unsigned long *)bkt); local_irq_restore(flags); } static inline struct rhash_head *__rht_ptr( struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt) { return (struct rhash_head *) ((unsigned long)p & ~BIT(0) ?: (unsigned long)RHT_NULLS_MARKER(bkt)); } /* * Where 'bkt' is a bucket and might be locked: * rht_ptr_rcu() dereferences that pointer and clears the lock bit. * rht_ptr() dereferences in a context where the bucket is locked. * rht_ptr_exclusive() dereferences in a context where exclusive * access is guaranteed, such as when destroying the table. */ static inline struct rhash_head *rht_ptr_rcu( struct rhash_lock_head __rcu *const *bkt) { return __rht_ptr(rcu_dereference(*bkt), bkt); } static inline struct rhash_head *rht_ptr( struct rhash_lock_head __rcu *const *bkt, struct bucket_table *tbl, unsigned int hash) { return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt); } static inline struct rhash_head *rht_ptr_exclusive( struct rhash_lock_head __rcu *const *bkt) { return __rht_ptr(rcu_dereference_protected(*bkt, 1), bkt); } static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt, struct rhash_head *obj) { if (rht_is_a_nulls(obj)) obj = NULL; rcu_assign_pointer(*bkt, (void *)((unsigned long)obj | BIT(0))); } static inline void rht_assign_unlock(struct bucket_table *tbl, struct rhash_lock_head __rcu **bkt, struct rhash_head *obj, unsigned long flags) { if (rht_is_a_nulls(obj)) obj = NULL; lock_map_release(&tbl->dep_map); rcu_assign_pointer(*bkt, (void *)obj); preempt_enable(); __release(bitlock); local_irq_restore(flags); } /** * rht_for_each_from - iterate over hash chain from given head * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index */ #define rht_for_each_from(pos, head, tbl, hash) \ for (pos = head; \ !rht_is_a_nulls(pos); \ pos = rht_dereference_bucket((pos)->next, tbl, hash)) /** * rht_for_each - iterate over hash chain * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index */ #define rht_for_each(pos, tbl, hash) \ rht_for_each_from(pos, rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ tbl, hash) /** * rht_for_each_entry_from - iterate over hash chain from given head * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. */ #define rht_for_each_entry_from(tpos, pos, head, tbl, hash, member) \ for (pos = head; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = rht_dereference_bucket((pos)->next, tbl, hash)) /** * rht_for_each_entry - iterate over hash chain of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. */ #define rht_for_each_entry(tpos, pos, tbl, hash, member) \ rht_for_each_entry_from(tpos, pos, \ rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ tbl, hash, member) /** * rht_for_each_entry_safe - safely iterate over hash chain of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @next: the &struct rhash_head to use as next in loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. * * This hash chain list-traversal primitive allows for the looped code to * remove the loop cursor from the list. */ #define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \ for (pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ next = !rht_is_a_nulls(pos) ? \ rht_dereference_bucket(pos->next, tbl, hash) : NULL; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = next, \ next = !rht_is_a_nulls(pos) ? \ rht_dereference_bucket(pos->next, tbl, hash) : NULL) /** * rht_for_each_rcu_from - iterate over rcu hash chain from given head * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_rcu_from(pos, head, tbl, hash) \ for (({barrier(); }), \ pos = head; \ !rht_is_a_nulls(pos); \ pos = rcu_dereference_raw(pos->next)) /** * rht_for_each_rcu - iterate over rcu hash chain * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_rcu(pos, tbl, hash) \ for (({barrier(); }), \ pos = rht_ptr_rcu(rht_bucket(tbl, hash)); \ !rht_is_a_nulls(pos); \ pos = rcu_dereference_raw(pos->next)) /** * rht_for_each_entry_rcu_from - iterated over rcu hash chain from given head * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_entry_rcu_from(tpos, pos, head, tbl, hash, member) \ for (({barrier(); }), \ pos = head; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = rht_dereference_bucket_rcu(pos->next, tbl, hash)) /** * rht_for_each_entry_rcu - iterate over rcu hash chain of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \ rht_for_each_entry_rcu_from(tpos, pos, \ rht_ptr_rcu(rht_bucket(tbl, hash)), \ tbl, hash, member) /** * rhl_for_each_rcu - iterate over rcu hash table list * @pos: the &struct rlist_head to use as a loop cursor. * @list: the head of the list * * This hash chain list-traversal primitive should be used on the * list returned by rhltable_lookup. */ #define rhl_for_each_rcu(pos, list) \ for (pos = list; pos; pos = rcu_dereference_raw(pos->next)) /** * rhl_for_each_entry_rcu - iterate over rcu hash table list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rlist_head to use as a loop cursor. * @list: the head of the list * @member: name of the &struct rlist_head within the hashable struct. * * This hash chain list-traversal primitive should be used on the * list returned by rhltable_lookup. */ #define rhl_for_each_entry_rcu(tpos, pos, list, member) \ for (pos = list; pos && rht_entry(tpos, pos, member); \ pos = rcu_dereference_raw(pos->next)) static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, const void *obj) { struct rhashtable *ht = arg->ht; const char *ptr = obj; return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len); } /* Internal function, do not use. */ static inline struct rhash_head *__rhashtable_lookup( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { struct rhashtable_compare_arg arg = { .ht = ht, .key = key, }; struct rhash_lock_head __rcu *const *bkt; struct bucket_table *tbl; struct rhash_head *he; unsigned int hash; tbl = rht_dereference_rcu(ht->tbl, ht); restart: hash = rht_key_hashfn(ht, tbl, key, params); bkt = rht_bucket(tbl, hash); do { rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) { if (params.obj_cmpfn ? params.obj_cmpfn(&arg, rht_obj(ht, he)) : rhashtable_compare(&arg, rht_obj(ht, he))) continue; return he; } /* An object might have been moved to a different hash chain, * while we walk along it - better check and retry. */ } while (he != RHT_NULLS_MARKER(bkt)); /* Ensure we see any new tables. */ smp_rmb(); tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (unlikely(tbl)) goto restart; return NULL; } /** * rhashtable_lookup - search hash table * @ht: hash table * @key: the pointer to the key * @params: hash table parameters * * Computes the hash value for the key and traverses the bucket chain looking * for an entry with an identical key. The first matching entry is returned. * * This must only be called under the RCU read lock. * * Returns the first entry on which the compare function returned true. */ static inline void *rhashtable_lookup( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { struct rhash_head *he = __rhashtable_lookup(ht, key, params); return he ? rht_obj(ht, he) : NULL; } /** * rhashtable_lookup_fast - search hash table, without RCU read lock * @ht: hash table * @key: the pointer to the key * @params: hash table parameters * * Computes the hash value for the key and traverses the bucket chain looking * for an entry with an identical key. The first matching entry is returned. * * Only use this function when you have other mechanisms guaranteeing * that the object won't go away after the RCU read lock is released. * * Returns the first entry on which the compare function returned true. */ static inline void *rhashtable_lookup_fast( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { void *obj; rcu_read_lock(); obj = rhashtable_lookup(ht, key, params); rcu_read_unlock(); return obj; } /** * rhltable_lookup - search hash list table * @hlt: hash table * @key: the pointer to the key * @params: hash table parameters * * Computes the hash value for the key and traverses the bucket chain looking * for an entry with an identical key. All matching entries are returned * in a list. * * This must only be called under the RCU read lock. * * Returns the list of entries that match the given key. */ static inline struct rhlist_head *rhltable_lookup( struct rhltable *hlt, const void *key, const struct rhashtable_params params) { struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params); return he ? container_of(he, struct rhlist_head, rhead) : NULL; } /* Internal function, please use rhashtable_insert_fast() instead. This * function returns the existing element already in hashes if there is a clash, * otherwise it returns an error via ERR_PTR(). */ static inline void *__rhashtable_insert_fast( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) { struct rhashtable_compare_arg arg = { .ht = ht, .key = key, }; struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct bucket_table *tbl; struct rhash_head *head; unsigned long flags; unsigned int hash; int elasticity; void *data; rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); hash = rht_head_hashfn(ht, tbl, obj, params); elasticity = RHT_ELASTICITY; bkt = rht_bucket_insert(ht, tbl, hash); data = ERR_PTR(-ENOMEM); if (!bkt) goto out; pprev = NULL; flags = rht_lock(tbl, bkt); if (unlikely(rcu_access_pointer(tbl->future_tbl))) { slow_path: rht_unlock(tbl, bkt, flags); rcu_read_unlock(); return rhashtable_insert_slow(ht, key, obj); } rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) { struct rhlist_head *plist; struct rhlist_head *list; elasticity--; if (!key || (params.obj_cmpfn ? params.obj_cmpfn(&arg, rht_obj(ht, head)) : rhashtable_compare(&arg, rht_obj(ht, head)))) { pprev = &head->next; continue; } data = rht_obj(ht, head); if (!rhlist) goto out_unlock; list = container_of(obj, struct rhlist_head, rhead); plist = container_of(head, struct rhlist_head, rhead); RCU_INIT_POINTER(list->next, plist); head = rht_dereference_bucket(head->next, tbl, hash); RCU_INIT_POINTER(list->rhead.next, head); if (pprev) { rcu_assign_pointer(*pprev, obj); rht_unlock(tbl, bkt, flags); } else rht_assign_unlock(tbl, bkt, obj, flags); data = NULL; goto out; } if (elasticity <= 0) goto slow_path; data = ERR_PTR(-E2BIG); if (unlikely(rht_grow_above_max(ht, tbl))) goto out_unlock; if (unlikely(rht_grow_above_100(ht, tbl))) goto slow_path; /* Inserting at head of list makes unlocking free. */ head = rht_ptr(bkt, tbl, hash); RCU_INIT_POINTER(obj->next, head); if (rhlist) { struct rhlist_head *list; list = container_of(obj, struct rhlist_head, rhead); RCU_INIT_POINTER(list->next, NULL); } atomic_inc(&ht->nelems); rht_assign_unlock(tbl, bkt, obj, flags); if (rht_grow_above_75(ht, tbl)) schedule_work(&ht->run_work); data = NULL; out: rcu_read_unlock(); return data; out_unlock: rht_unlock(tbl, bkt, flags); goto out; } /** * rhashtable_insert_fast - insert object into hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * Will take the per bucket bitlock to protect against mutual mutations * on the same bucket. Multiple insertions may occur in parallel unless * they map to the same bucket. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ static inline int rhashtable_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { void *ret; ret = __rhashtable_insert_fast(ht, NULL, obj, params, false); if (IS_ERR(ret)) return PTR_ERR(ret); return ret == NULL ? 0 : -EEXIST; } /** * rhltable_insert_key - insert object into hash list table * @hlt: hash list table * @key: the pointer to the key * @list: pointer to hash list head inside object * @params: hash table parameters * * Will take the per bucket bitlock to protect against mutual mutations * on the same bucket. Multiple insertions may occur in parallel unless * they map to the same bucket. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ static inline int rhltable_insert_key( struct rhltable *hlt, const void *key, struct rhlist_head *list, const struct rhashtable_params params) { return PTR_ERR(__rhashtable_insert_fast(&hlt->ht, key, &list->rhead, params, true)); } /** * rhltable_insert - insert object into hash list table * @hlt: hash list table * @list: pointer to hash list head inside object * @params: hash table parameters * * Will take the per bucket bitlock to protect against mutual mutations * on the same bucket. Multiple insertions may occur in parallel unless * they map to the same bucket. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ static inline int rhltable_insert( struct rhltable *hlt, struct rhlist_head *list, const struct rhashtable_params params) { const char *key = rht_obj(&hlt->ht, &list->rhead); key += params.key_offset; return rhltable_insert_key(hlt, key, list, params); } /** * rhashtable_lookup_insert_fast - lookup and insert object into hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * This lookup function may only be used for fixed key hash table (key_len * parameter set). It will BUG() if used inappropriately. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ static inline int rhashtable_lookup_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { const char *key = rht_obj(ht, obj); void *ret; BUG_ON(ht->p.obj_hashfn); ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params, false); if (IS_ERR(ret)) return PTR_ERR(ret); return ret == NULL ? 0 : -EEXIST; } /** * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * Just like rhashtable_lookup_insert_fast(), but this function returns the * object if it exists, NULL if it did not and the insertion was successful, * and an ERR_PTR otherwise. */ static inline void *rhashtable_lookup_get_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { const char *key = rht_obj(ht, obj); BUG_ON(ht->p.obj_hashfn); return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params, false); } /** * rhashtable_lookup_insert_key - search and insert object to hash table * with explicit key * @ht: hash table * @key: key * @obj: pointer to hash head inside object * @params: hash table parameters * * Lookups may occur in parallel with hashtable mutations and resizing. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. * * Returns zero on success. */ static inline int rhashtable_lookup_insert_key( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params) { void *ret; BUG_ON(!ht->p.obj_hashfn || !key); ret = __rhashtable_insert_fast(ht, key, obj, params, false); if (IS_ERR(ret)) return PTR_ERR(ret); return ret == NULL ? 0 : -EEXIST; } /** * rhashtable_lookup_get_insert_key - lookup and insert object into hash table * @ht: hash table * @key: key * @obj: pointer to hash head inside object * @params: hash table parameters * * Just like rhashtable_lookup_insert_key(), but this function returns the * object if it exists, NULL if it does not and the insertion was successful, * and an ERR_PTR otherwise. */ static inline void *rhashtable_lookup_get_insert_key( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params) { BUG_ON(!ht->p.obj_hashfn || !key); return __rhashtable_insert_fast(ht, key, obj, params, false); } /* Internal function, please use rhashtable_remove_fast() instead */ static inline int __rhashtable_remove_fast_one( struct rhashtable *ht, struct bucket_table *tbl, struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) { struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; unsigned long flags; unsigned int hash; int err = -ENOENT; hash = rht_head_hashfn(ht, tbl, obj, params); bkt = rht_bucket_var(tbl, hash); if (!bkt) return -ENOENT; pprev = NULL; flags = rht_lock(tbl, bkt); rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { struct rhlist_head *list; list = container_of(he, struct rhlist_head, rhead); if (he != obj) { struct rhlist_head __rcu **lpprev; pprev = &he->next; if (!rhlist) continue; do { lpprev = &list->next; list = rht_dereference_bucket(list->next, tbl, hash); } while (list && obj != &list->rhead); if (!list) continue; list = rht_dereference_bucket(list->next, tbl, hash); RCU_INIT_POINTER(*lpprev, list); err = 0; break; } obj = rht_dereference_bucket(obj->next, tbl, hash); err = 1; if (rhlist) { list = rht_dereference_bucket(list->next, tbl, hash); if (list) { RCU_INIT_POINTER(list->rhead.next, obj); obj = &list->rhead; err = 0; } } if (pprev) { rcu_assign_pointer(*pprev, obj); rht_unlock(tbl, bkt, flags); } else { rht_assign_unlock(tbl, bkt, obj, flags); } goto unlocked; } rht_unlock(tbl, bkt, flags); unlocked: if (err > 0) { atomic_dec(&ht->nelems); if (unlikely(ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))) schedule_work(&ht->run_work); err = 0; } return err; } /* Internal function, please use rhashtable_remove_fast() instead */ static inline int __rhashtable_remove_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) { struct bucket_table *tbl; int err; rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); /* Because we have already taken (and released) the bucket * lock in old_tbl, if we find that future_tbl is not yet * visible then that guarantees the entry to still be in * the old tbl if it exists. */ while ((err = __rhashtable_remove_fast_one(ht, tbl, obj, params, rhlist)) && (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) ; rcu_read_unlock(); return err; } /** * rhashtable_remove_fast - remove object from hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * Since the hash chain is single linked, the removal operation needs to * walk the bucket chain upon removal. The removal operation is thus * considerable slow if the hash table is not correctly sized. * * Will automatically shrink the table if permitted when residency drops * below 30%. * * Returns zero on success, -ENOENT if the entry could not be found. */ static inline int rhashtable_remove_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { return __rhashtable_remove_fast(ht, obj, params, false); } /** * rhltable_remove - remove object from hash list table * @hlt: hash list table * @list: pointer to hash list head inside object * @params: hash table parameters * * Since the hash chain is single linked, the removal operation needs to * walk the bucket chain upon removal. The removal operation is thus * considerably slower if the hash table is not correctly sized. * * Will automatically shrink the table if permitted when residency drops * below 30% * * Returns zero on success, -ENOENT if the entry could not be found. */ static inline int rhltable_remove( struct rhltable *hlt, struct rhlist_head *list, const struct rhashtable_params params) { return __rhashtable_remove_fast(&hlt->ht, &list->rhead, params, true); } /* Internal function, please use rhashtable_replace_fast() instead */ static inline int __rhashtable_replace_fast( struct rhashtable *ht, struct bucket_table *tbl, struct rhash_head *obj_old, struct rhash_head *obj_new, const struct rhashtable_params params) { struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; unsigned long flags; unsigned int hash; int err = -ENOENT; /* Minimally, the old and new objects must have same hash * (which should mean identifiers are the same). */ hash = rht_head_hashfn(ht, tbl, obj_old, params); if (hash != rht_head_hashfn(ht, tbl, obj_new, params)) return -EINVAL; bkt = rht_bucket_var(tbl, hash); if (!bkt) return -ENOENT; pprev = NULL; flags = rht_lock(tbl, bkt); rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { if (he != obj_old) { pprev = &he->next; continue; } rcu_assign_pointer(obj_new->next, obj_old->next); if (pprev) { rcu_assign_pointer(*pprev, obj_new); rht_unlock(tbl, bkt, flags); } else { rht_assign_unlock(tbl, bkt, obj_new, flags); } err = 0; goto unlocked; } rht_unlock(tbl, bkt, flags); unlocked: return err; } /** * rhashtable_replace_fast - replace an object in hash table * @ht: hash table * @obj_old: pointer to hash head inside object being replaced * @obj_new: pointer to hash head inside object which is new * @params: hash table parameters * * Replacing an object doesn't affect the number of elements in the hash table * or bucket, so we don't need to worry about shrinking or expanding the * table here. * * Returns zero on success, -ENOENT if the entry could not be found, * -EINVAL if hash is not the same for the old and new objects. */ static inline int rhashtable_replace_fast( struct rhashtable *ht, struct rhash_head *obj_old, struct rhash_head *obj_new, const struct rhashtable_params params) { struct bucket_table *tbl; int err; rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); /* Because we have already taken (and released) the bucket * lock in old_tbl, if we find that future_tbl is not yet * visible then that guarantees the entry to still be in * the old tbl if it exists. */ while ((err = __rhashtable_replace_fast(ht, tbl, obj_old, obj_new, params)) && (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) ; rcu_read_unlock(); return err; } /** * rhltable_walk_enter - Initialise an iterator * @hlt: Table to walk over * @iter: Hash table Iterator * * This function prepares a hash table walk. * * Note that if you restart a walk after rhashtable_walk_stop you * may see the same object twice. Also, you may miss objects if * there are removals in between rhashtable_walk_stop and the next * call to rhashtable_walk_start. * * For a completely stable walk you should construct your own data * structure outside the hash table. * * This function may be called from any process context, including * non-preemptable context, but cannot be called from softirq or * hardirq context. * * You must call rhashtable_walk_exit after this function returns. */ static inline void rhltable_walk_enter(struct rhltable *hlt, struct rhashtable_iter *iter) { rhashtable_walk_enter(&hlt->ht, iter); } /** * rhltable_free_and_destroy - free elements and destroy hash list table * @hlt: the hash list table to destroy * @free_fn: callback to release resources of element * @arg: pointer passed to free_fn * * See documentation for rhashtable_free_and_destroy. */ static inline void rhltable_free_and_destroy(struct rhltable *hlt, void (*free_fn)(void *ptr, void *arg), void *arg) { rhashtable_free_and_destroy(&hlt->ht, free_fn, arg); } static inline void rhltable_destroy(struct rhltable *hlt) { rhltable_free_and_destroy(hlt, NULL, NULL); } #endif /* _LINUX_RHASHTABLE_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SWAPOPS_H #define _LINUX_SWAPOPS_H #include <linux/radix-tree.h> #include <linux/bug.h> #include <linux/mm_types.h> #ifdef CONFIG_MMU #ifdef CONFIG_SWAP #include <linux/swapfile.h> #endif /* CONFIG_SWAP */ /* * swapcache pages are stored in the swapper_space radix tree. We want to * get good packing density in that tree, so the index should be dense in * the low-order bits. * * We arrange the `type' and `offset' fields so that `type' is at the six * high-order bits of the swp_entry_t and `offset' is right-aligned in the * remaining bits. Although `type' itself needs only five bits, we allow for * shmem/tmpfs to shift it all up a further one bit: see swp_to_radix_entry(). * * swp_entry_t's are *never* stored anywhere in their arch-dependent format. */ #define SWP_TYPE_SHIFT (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT) #define SWP_OFFSET_MASK ((1UL << SWP_TYPE_SHIFT) - 1) /* * Definitions only for PFN swap entries (see is_pfn_swap_entry()). To * store PFN, we only need SWP_PFN_BITS bits. Each of the pfn swap entries * can use the extra bits to store other information besides PFN. */ #ifdef MAX_PHYSMEM_BITS #define SWP_PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) #else /* MAX_PHYSMEM_BITS */ #define SWP_PFN_BITS min_t(int, \ sizeof(phys_addr_t) * 8 - PAGE_SHIFT, \ SWP_TYPE_SHIFT) #endif /* MAX_PHYSMEM_BITS */ #define SWP_PFN_MASK (BIT(SWP_PFN_BITS) - 1) /** * Migration swap entry specific bitfield definitions. Layout: * * |----------+--------------------| * | swp_type | swp_offset | * |----------+--------+-+-+-------| * | | resv |D|A| PFN | * |----------+--------+-+-+-------| * * @SWP_MIG_YOUNG_BIT: Whether the page used to have young bit set (bit A) * @SWP_MIG_DIRTY_BIT: Whether the page used to have dirty bit set (bit D) * * Note: A/D bits will be stored in migration entries iff there're enough * free bits in arch specific swp offset. By default we'll ignore A/D bits * when migrating a page. Please refer to migration_entry_supports_ad() * for more information. If there're more bits besides PFN and A/D bits, * they should be reserved and always be zeros. */ #define SWP_MIG_YOUNG_BIT (SWP_PFN_BITS) #define SWP_MIG_DIRTY_BIT (SWP_PFN_BITS + 1) #define SWP_MIG_TOTAL_BITS (SWP_PFN_BITS + 2) #define SWP_MIG_YOUNG BIT(SWP_MIG_YOUNG_BIT) #define SWP_MIG_DIRTY BIT(SWP_MIG_DIRTY_BIT) static inline bool is_pfn_swap_entry(swp_entry_t entry); /* Clear all flags but only keep swp_entry_t related information */ static inline pte_t pte_swp_clear_flags(pte_t pte) { if (pte_swp_exclusive(pte)) pte = pte_swp_clear_exclusive(pte); if (pte_swp_soft_dirty(pte)) pte = pte_swp_clear_soft_dirty(pte); if (pte_swp_uffd_wp(pte)) pte = pte_swp_clear_uffd_wp(pte); return pte; } /* * Store a type+offset into a swp_entry_t in an arch-independent format */ static inline swp_entry_t swp_entry(unsigned long type, pgoff_t offset) { swp_entry_t ret; ret.val = (type << SWP_TYPE_SHIFT) | (offset & SWP_OFFSET_MASK); return ret; } /* * Extract the `type' field from a swp_entry_t. The swp_entry_t is in * arch-independent format */ static inline unsigned swp_type(swp_entry_t entry) { return (entry.val >> SWP_TYPE_SHIFT); } /* * Extract the `offset' field from a swp_entry_t. The swp_entry_t is in * arch-independent format */ static inline pgoff_t swp_offset(swp_entry_t entry) { return entry.val & SWP_OFFSET_MASK; } /* * This should only be called upon a pfn swap entry to get the PFN stored * in the swap entry. Please refers to is_pfn_swap_entry() for definition * of pfn swap entry. */ static inline unsigned long swp_offset_pfn(swp_entry_t entry) { VM_BUG_ON(!is_pfn_swap_entry(entry)); return swp_offset(entry) & SWP_PFN_MASK; } /* check whether a pte points to a swap entry */ static inline int is_swap_pte(pte_t pte) { return !pte_none(pte) && !pte_present(pte); } /* * Convert the arch-dependent pte representation of a swp_entry_t into an * arch-independent swp_entry_t. */ static inline swp_entry_t pte_to_swp_entry(pte_t pte) { swp_entry_t arch_entry; pte = pte_swp_clear_flags(pte); arch_entry = __pte_to_swp_entry(pte); return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); } /* * Convert the arch-independent representation of a swp_entry_t into the * arch-dependent pte representation. */ static inline pte_t swp_entry_to_pte(swp_entry_t entry) { swp_entry_t arch_entry; arch_entry = __swp_entry(swp_type(entry), swp_offset(entry)); return __swp_entry_to_pte(arch_entry); } static inline swp_entry_t radix_to_swp_entry(void *arg) { swp_entry_t entry; entry.val = xa_to_value(arg); return entry; } static inline void *swp_to_radix_entry(swp_entry_t entry) { return xa_mk_value(entry.val); } #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { return swp_entry(SWP_DEVICE_READ, offset); } static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset) { return swp_entry(SWP_DEVICE_WRITE, offset); } static inline bool is_device_private_entry(swp_entry_t entry) { int type = swp_type(entry); return type == SWP_DEVICE_READ || type == SWP_DEVICE_WRITE; } static inline bool is_writable_device_private_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_DEVICE_WRITE); } static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { return swp_entry(SWP_DEVICE_EXCLUSIVE, offset); } static inline bool is_device_exclusive_entry(swp_entry_t entry) { return swp_type(entry) == SWP_DEVICE_EXCLUSIVE; } #else /* CONFIG_DEVICE_PRIVATE */ static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline bool is_device_private_entry(swp_entry_t entry) { return false; } static inline bool is_writable_device_private_entry(swp_entry_t entry) { return false; } static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline bool is_device_exclusive_entry(swp_entry_t entry) { return false; } #endif /* CONFIG_DEVICE_PRIVATE */ #ifdef CONFIG_MIGRATION static inline int is_migration_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_MIGRATION_READ || swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE || swp_type(entry) == SWP_MIGRATION_WRITE); } static inline int is_writable_migration_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE); } static inline int is_readable_migration_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_MIGRATION_READ); } static inline int is_readable_exclusive_migration_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE); } static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { return swp_entry(SWP_MIGRATION_READ, offset); } static inline swp_entry_t make_readable_exclusive_migration_entry(pgoff_t offset) { return swp_entry(SWP_MIGRATION_READ_EXCLUSIVE, offset); } static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) { return swp_entry(SWP_MIGRATION_WRITE, offset); } /* * Returns whether the host has large enough swap offset field to support * carrying over pgtable A/D bits for page migrations. The result is * pretty much arch specific. */ static inline bool migration_entry_supports_ad(void) { #ifdef CONFIG_SWAP return swap_migration_ad_supported; #else /* CONFIG_SWAP */ return false; #endif /* CONFIG_SWAP */ } static inline swp_entry_t make_migration_entry_young(swp_entry_t entry) { if (migration_entry_supports_ad()) return swp_entry(swp_type(entry), swp_offset(entry) | SWP_MIG_YOUNG); return entry; } static inline bool is_migration_entry_young(swp_entry_t entry) { if (migration_entry_supports_ad()) return swp_offset(entry) & SWP_MIG_YOUNG; /* Keep the old behavior of aging page after migration */ return false; } static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) { if (migration_entry_supports_ad()) return swp_entry(swp_type(entry), swp_offset(entry) | SWP_MIG_DIRTY); return entry; } static inline bool is_migration_entry_dirty(swp_entry_t entry) { if (migration_entry_supports_ad()) return swp_offset(entry) & SWP_MIG_DIRTY; /* Keep the old behavior of clean page after migration */ return false; } extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); extern void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte); #else /* CONFIG_MIGRATION */ static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline swp_entry_t make_readable_exclusive_migration_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline int is_migration_entry(swp_entry_t swp) { return 0; } static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } static inline void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { } static inline int is_writable_migration_entry(swp_entry_t entry) { return 0; } static inline int is_readable_migration_entry(swp_entry_t entry) { return 0; } static inline swp_entry_t make_migration_entry_young(swp_entry_t entry) { return entry; } static inline bool is_migration_entry_young(swp_entry_t entry) { return false; } static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) { return entry; } static inline bool is_migration_entry_dirty(swp_entry_t entry) { return false; } #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_MEMORY_FAILURE /* * Support for hardware poisoned pages */ static inline swp_entry_t make_hwpoison_entry(struct page *page) { BUG_ON(!PageLocked(page)); return swp_entry(SWP_HWPOISON, page_to_pfn(page)); } static inline int is_hwpoison_entry(swp_entry_t entry) { return swp_type(entry) == SWP_HWPOISON; } #else static inline swp_entry_t make_hwpoison_entry(struct page *page) { return swp_entry(0, 0); } static inline int is_hwpoison_entry(swp_entry_t swp) { return 0; } #endif typedef unsigned long pte_marker; #define PTE_MARKER_UFFD_WP BIT(0) /* * "Poisoned" here is meant in the very general sense of "future accesses are * invalid", instead of referring very specifically to hardware memory errors. * This marker is meant to represent any of various different causes of this. * * Note that, when encountered by the faulting logic, PTEs with this marker will * result in VM_FAULT_HWPOISON and thus regardless trigger hardware memory error * logic. */ #define PTE_MARKER_POISONED BIT(1) /* * Indicates that, on fault, this PTE will case a SIGSEGV signal to be * sent. This means guard markers behave in effect as if the region were mapped * PROT_NONE, rather than if they were a memory hole or equivalent. */ #define PTE_MARKER_GUARD BIT(2) #define PTE_MARKER_MASK (BIT(3) - 1) static inline swp_entry_t make_pte_marker_entry(pte_marker marker) { return swp_entry(SWP_PTE_MARKER, marker); } static inline bool is_pte_marker_entry(swp_entry_t entry) { return swp_type(entry) == SWP_PTE_MARKER; } static inline pte_marker pte_marker_get(swp_entry_t entry) { return swp_offset(entry) & PTE_MARKER_MASK; } static inline bool is_pte_marker(pte_t pte) { return is_swap_pte(pte) && is_pte_marker_entry(pte_to_swp_entry(pte)); } static inline pte_t make_pte_marker(pte_marker marker) { return swp_entry_to_pte(make_pte_marker_entry(marker)); } static inline swp_entry_t make_poisoned_swp_entry(void) { return make_pte_marker_entry(PTE_MARKER_POISONED); } static inline int is_poisoned_swp_entry(swp_entry_t entry) { return is_pte_marker_entry(entry) && (pte_marker_get(entry) & PTE_MARKER_POISONED); } static inline swp_entry_t make_guard_swp_entry(void) { return make_pte_marker_entry(PTE_MARKER_GUARD); } static inline int is_guard_swp_entry(swp_entry_t entry) { return is_pte_marker_entry(entry) && (pte_marker_get(entry) & PTE_MARKER_GUARD); } /* * This is a special version to check pte_none() just to cover the case when * the pte is a pte marker. It existed because in many cases the pte marker * should be seen as a none pte; it's just that we have stored some information * onto the none pte so it becomes not-none any more. * * It should be used when the pte is file-backed, ram-based and backing * userspace pages, like shmem. It is not needed upon pgtables that do not * support pte markers at all. For example, it's not needed on anonymous * memory, kernel-only memory (including when the system is during-boot), * non-ram based generic file-system. It's fine to be used even there, but the * extra pte marker check will be pure overhead. */ static inline int pte_none_mostly(pte_t pte) { return pte_none(pte) || is_pte_marker(pte); } static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) { struct page *p = pfn_to_page(swp_offset_pfn(entry)); /* * Any use of migration entries may only occur while the * corresponding page is locked */ BUG_ON(is_migration_entry(entry) && !PageLocked(p)); return p; } static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry) { struct folio *folio = pfn_folio(swp_offset_pfn(entry)); /* * Any use of migration entries may only occur while the * corresponding folio is locked */ BUG_ON(is_migration_entry(entry) && !folio_test_locked(folio)); return folio; } /* * A pfn swap entry is a special type of swap entry that always has a pfn stored * in the swap offset. They can either be used to represent unaddressable device * memory, to restrict access to a page undergoing migration or to represent a * pfn which has been hwpoisoned and unmapped. */ static inline bool is_pfn_swap_entry(swp_entry_t entry) { /* Make sure the swp offset can always store the needed fields */ BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS); return is_migration_entry(entry) || is_device_private_entry(entry) || is_device_exclusive_entry(entry) || is_hwpoison_entry(entry); } struct page_vma_mapped_walk; #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION extern int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, struct page *page); extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new); extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd); static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd) { swp_entry_t arch_entry; if (pmd_swp_soft_dirty(pmd)) pmd = pmd_swp_clear_soft_dirty(pmd); if (pmd_swp_uffd_wp(pmd)) pmd = pmd_swp_clear_uffd_wp(pmd); arch_entry = __pmd_to_swp_entry(pmd); return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); } static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) { swp_entry_t arch_entry; arch_entry = __swp_entry(swp_type(entry), swp_offset(entry)); return __swp_entry_to_pmd(arch_entry); } static inline int is_pmd_migration_entry(pmd_t pmd) { return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd)); } #else /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, struct page *page) { BUILD_BUG(); } static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) { BUILD_BUG(); } static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { } static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd) { return swp_entry(0, 0); } static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) { return __pmd(0); } static inline int is_pmd_migration_entry(pmd_t pmd) { return 0; } #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ static inline int non_swap_entry(swp_entry_t entry) { return swp_type(entry) >= MAX_SWAPFILES; } #endif /* CONFIG_MMU */ #endif /* _LINUX_SWAPOPS_H */
17840 17831 17840 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 // SPDX-License-Identifier: GPL-2.0-only /* * xsave/xrstor support. * * Author: Suresh Siddha <suresh.b.siddha@intel.com> */ #include <linux/bitops.h> #include <linux/compat.h> #include <linux/cpu.h> #include <linux/mman.h> #include <linux/nospec.h> #include <linux/pkeys.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/vmalloc.h> #include <linux/coredump.h> #include <linux/sort.h> #include <asm/fpu/api.h> #include <asm/fpu/regset.h> #include <asm/fpu/signal.h> #include <asm/fpu/xcr.h> #include <asm/cpuid/api.h> #include <asm/msr.h> #include <asm/tlbflush.h> #include <asm/prctl.h> #include <asm/elf.h> #include <uapi/asm/elf.h> #include "context.h" #include "internal.h" #include "legacy.h" #include "xstate.h" #define for_each_extended_xfeature(bit, mask) \ (bit) = FIRST_EXTENDED_XFEATURE; \ for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask)) /* * Although we spell it out in here, the Processor Trace * xfeature is completely unused. We use other mechanisms * to save/restore PT state in Linux. */ static const char *xfeature_names[] = { "x87 floating point registers", "SSE registers", "AVX registers", "MPX bounds registers", "MPX CSR", "AVX-512 opmask", "AVX-512 Hi256", "AVX-512 ZMM_Hi256", "Processor Trace (unused)", "Protection Keys User registers", "PASID state", "Control-flow User registers", "Control-flow Kernel registers (KVM only)", "unknown xstate feature", "unknown xstate feature", "unknown xstate feature", "unknown xstate feature", "AMX Tile config", "AMX Tile data", "APX registers", "unknown xstate feature", }; static unsigned short xsave_cpuid_features[] __initdata = { [XFEATURE_FP] = X86_FEATURE_FPU, [XFEATURE_SSE] = X86_FEATURE_XMM, [XFEATURE_YMM] = X86_FEATURE_AVX, [XFEATURE_BNDREGS] = X86_FEATURE_MPX, [XFEATURE_BNDCSR] = X86_FEATURE_MPX, [XFEATURE_OPMASK] = X86_FEATURE_AVX512F, [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F, [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F, [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT, [XFEATURE_PKRU] = X86_FEATURE_OSPKE, [XFEATURE_PASID] = X86_FEATURE_ENQCMD, [XFEATURE_CET_USER] = X86_FEATURE_SHSTK, [XFEATURE_CET_KERNEL] = X86_FEATURE_SHSTK, [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, [XFEATURE_APX] = X86_FEATURE_APX, }; static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init; /* * Ordering of xstate components in uncompacted format: The xfeature * number does not necessarily indicate its position in the XSAVE buffer. * This array defines the traversal order of xstate features. */ static unsigned int xfeature_uncompact_order[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static inline unsigned int next_xfeature_order(unsigned int i, u64 mask) { for (; xfeature_uncompact_order[i] != -1; i++) { if (mask & BIT_ULL(xfeature_uncompact_order[i])) break; } return i; } /* Iterate xstate features in uncompacted order: */ #define for_each_extended_xfeature_in_order(i, mask) \ for (i = 0; \ i = next_xfeature_order(i, mask), \ xfeature_uncompact_order[i] != -1; \ i++) #define XSTATE_FLAG_SUPERVISOR BIT(0) #define XSTATE_FLAG_ALIGNED64 BIT(1) /* * Return whether the system supports a given xfeature. * * Also return the name of the (most advanced) feature that the caller requested: */ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) { u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features; if (unlikely(feature_name)) { long xfeature_idx, max_idx; u64 xfeatures_print; /* * So we use FLS here to be able to print the most advanced * feature that was requested but is missing. So if a driver * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the * missing AVX feature - this is the most informative message * to users: */ if (xfeatures_missing) xfeatures_print = xfeatures_missing; else xfeatures_print = xfeatures_needed; xfeature_idx = fls64(xfeatures_print)-1; max_idx = ARRAY_SIZE(xfeature_names)-1; xfeature_idx = min(xfeature_idx, max_idx); *feature_name = xfeature_names[xfeature_idx]; } if (xfeatures_missing) return 0; return 1; } EXPORT_SYMBOL_GPL(cpu_has_xfeatures); static bool xfeature_is_aligned64(int xfeature_nr) { return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64; } static bool xfeature_is_supervisor(int xfeature_nr) { return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR; } static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature) { unsigned int offs, i; /* * Non-compacted format and legacy features use the cached fixed * offsets. */ if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) || xfeature <= XFEATURE_SSE) return xstate_offsets[xfeature]; /* * Compacted format offsets depend on the actual content of the * compacted xsave area which is determined by the xcomp_bv header * field. */ offs = FXSAVE_SIZE + XSAVE_HDR_SIZE; for_each_extended_xfeature(i, xcomp_bv) { if (xfeature_is_aligned64(i)) offs = ALIGN(offs, 64); if (i == xfeature) break; offs += xstate_sizes[i]; } return offs; } /* * Enable the extended processor state save/restore feature. * Called once per CPU onlining. */ void fpu__init_cpu_xstate(void) { if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features) return; cr4_set_bits(X86_CR4_OSXSAVE); /* * Must happen after CR4 setup and before xsetbv() to allow KVM * lazy passthrough. Write independent of the dynamic state static * key as that does not work on the boot CPU. This also ensures * that any stale state is wiped out from XFD. Reset the per CPU * xfd cache too. */ if (cpu_feature_enabled(X86_FEATURE_XFD)) xfd_set_state(init_fpstate.xfd); /* * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user * states can be set here. */ xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); /* * MSR_IA32_XSS sets supervisor states managed by XSAVES. */ if (boot_cpu_has(X86_FEATURE_XSAVES)) { wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | xfeatures_mask_independent()); } } static bool xfeature_enabled(enum xfeature xfeature) { return fpu_kernel_cfg.max_features & BIT_ULL(xfeature); } static int compare_xstate_offsets(const void *xfeature1, const void *xfeature2) { return xstate_offsets[*(unsigned int *)xfeature1] - xstate_offsets[*(unsigned int *)xfeature2]; } /* * Record the offsets and sizes of various xstates contained * in the XSAVE state memory layout. Also, create an ordered * list of xfeatures for handling out-of-order offsets. */ static void __init setup_xstate_cache(void) { u32 eax, ebx, ecx, edx, xfeature, i = 0; /* * The FP xstates and SSE xstates are legacy states. They are always * in the fixed offsets in the xsave area in either compacted form * or standard form. */ xstate_offsets[XFEATURE_FP] = 0; xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state, xmm_space); xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP]; xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, xmm_space); for_each_extended_xfeature(xfeature, fpu_kernel_cfg.max_features) { cpuid_count(CPUID_LEAF_XSTATE, xfeature, &eax, &ebx, &ecx, &edx); xstate_sizes[xfeature] = eax; xstate_flags[xfeature] = ecx; /* * If an xfeature is supervisor state, the offset in EBX is * invalid, leave it to -1. */ if (xfeature_is_supervisor(xfeature)) continue; xstate_offsets[xfeature] = ebx; /* Populate the list of xfeatures before sorting */ xfeature_uncompact_order[i++] = xfeature; } /* * Sort xfeatures by their offsets to support out-of-order * offsets in the uncompacted format. */ sort(xfeature_uncompact_order, i, sizeof(unsigned int), compare_xstate_offsets, NULL); } /* * Print out all the supported xstate features: */ static void __init print_xstate_features(void) { int i; for (i = 0; i < XFEATURE_MAX; i++) { u64 mask = BIT_ULL(i); const char *name; if (cpu_has_xfeatures(mask, &name)) pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", mask, name); } } /* * This check is important because it is easy to get XSTATE_* * confused with XSTATE_BIT_*. */ #define CHECK_XFEATURE(nr) do { \ WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \ WARN_ON(nr >= XFEATURE_MAX); \ } while (0) /* * Print out xstate component offsets and sizes */ static void __init print_xstate_offset_size(void) { int i; for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", i, xfeature_get_offset(fpu_kernel_cfg.max_features, i), i, xstate_sizes[i]); } } /* * This function is called only during boot time when x86 caps are not set * up and alternative can not be used yet. */ static __init void os_xrstor_booting(struct xregs_state *xstate) { u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE; u32 lmask = mask; u32 hmask = mask >> 32; int err; if (cpu_feature_enabled(X86_FEATURE_XSAVES)) XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); else XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); /* * We should never fault when copying from a kernel buffer, and the FPU * state we set at boot time should be valid. */ WARN_ON_FPU(err); } /* * All supported features have either init state all zeros or are * handled in setup_init_fpu() individually. This is an explicit * feature list and does not use XFEATURE_MASK*SUPPORTED to catch * newly added supported features at build time and make people * actually look at the init state for the new feature. */ #define XFEATURES_INIT_FPSTATE_HANDLED \ (XFEATURE_MASK_FP | \ XFEATURE_MASK_SSE | \ XFEATURE_MASK_YMM | \ XFEATURE_MASK_OPMASK | \ XFEATURE_MASK_ZMM_Hi256 | \ XFEATURE_MASK_Hi16_ZMM | \ XFEATURE_MASK_PKRU | \ XFEATURE_MASK_BNDREGS | \ XFEATURE_MASK_BNDCSR | \ XFEATURE_MASK_PASID | \ XFEATURE_MASK_CET_USER | \ XFEATURE_MASK_CET_KERNEL | \ XFEATURE_MASK_XTILE | \ XFEATURE_MASK_APX) /* * setup the xstate image representing the init state */ static void __init setup_init_fpu_buf(void) { BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED) != XFEATURES_INIT_FPSTATE_HANDLED); if (!boot_cpu_has(X86_FEATURE_XSAVE)) return; print_xstate_features(); xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures); /* * Init all the features state with header.xfeatures being 0x0 */ os_xrstor_booting(&init_fpstate.regs.xsave); /* * All components are now in init state. Read the state back so * that init_fpstate contains all non-zero init state. This only * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because * those use the init optimization which skips writing data for * components in init state. * * XSAVE could be used, but that would require to reshuffle the * data when XSAVEC/S is available because XSAVEC/S uses xstate * compaction. But doing so is a pointless exercise because most * components have an all zeros init state except for the legacy * ones (FP and SSE). Those can be saved with FXSAVE into the * legacy area. Adding new features requires to ensure that init * state is all zeroes or if not to add the necessary handling * here. */ fxsave(&init_fpstate.regs.fxsave); } int xfeature_size(int xfeature_nr) { u32 eax, ebx, ecx, edx; CHECK_XFEATURE(xfeature_nr); cpuid_count(CPUID_LEAF_XSTATE, xfeature_nr, &eax, &ebx, &ecx, &edx); return eax; } /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ static int validate_user_xstate_header(const struct xstate_header *hdr, struct fpstate *fpstate) { /* No unknown or supervisor features may be set */ if (hdr->xfeatures & ~fpstate->user_xfeatures) return -EINVAL; /* Userspace must use the uncompacted format */ if (hdr->xcomp_bv) return -EINVAL; /* * If 'reserved' is shrunken to add a new field, make sure to validate * that new field here! */ BUILD_BUG_ON(sizeof(hdr->reserved) != 48); /* No reserved bits may be set */ if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved))) return -EINVAL; return 0; } static void __init __xstate_dump_leaves(void) { int i; u32 eax, ebx, ecx, edx; static int should_dump = 1; if (!should_dump) return; should_dump = 0; /* * Dump out a few leaves past the ones that we support * just in case there are some goodies up there */ for (i = 0; i < XFEATURE_MAX + 10; i++) { cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx); pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", CPUID_LEAF_XSTATE, i, eax, ebx, ecx, edx); } } #define XSTATE_WARN_ON(x, fmt, ...) do { \ if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) { \ __xstate_dump_leaves(); \ } \ } while (0) #define XCHECK_SZ(sz, nr, __struct) ({ \ if (WARN_ONCE(sz != sizeof(__struct), \ "[%s]: struct is %zu bytes, cpu state %d bytes\n", \ xfeature_names[nr], sizeof(__struct), sz)) { \ __xstate_dump_leaves(); \ } \ true; \ }) /** * check_xtile_data_against_struct - Check tile data state size. * * Calculate the state size by multiplying the single tile size which is * recorded in a C struct, and the number of tiles that the CPU informs. * Compare the provided size with the calculation. * * @size: The tile data state size * * Returns: 0 on success, -EINVAL on mismatch. */ static int __init check_xtile_data_against_struct(int size) { u32 max_palid, palid, state_size; u32 eax, ebx, ecx, edx; u16 max_tile; /* * Check the maximum palette id: * eax: the highest numbered palette subleaf. */ cpuid_count(CPUID_LEAF_TILE, 0, &max_palid, &ebx, &ecx, &edx); /* * Cross-check each tile size and find the maximum number of * supported tiles. */ for (palid = 1, max_tile = 0; palid <= max_palid; palid++) { u16 tile_size, max; /* * Check the tile size info: * eax[31:16]: bytes per title * ebx[31:16]: the max names (or max number of tiles) */ cpuid_count(CPUID_LEAF_TILE, palid, &eax, &ebx, &edx, &edx); tile_size = eax >> 16; max = ebx >> 16; if (tile_size != sizeof(struct xtile_data)) { pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n", __stringify(XFEATURE_XTILE_DATA), sizeof(struct xtile_data), tile_size); __xstate_dump_leaves(); return -EINVAL; } if (max > max_tile) max_tile = max; } state_size = sizeof(struct xtile_data) * max_tile; if (size != state_size) { pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n", __stringify(XFEATURE_XTILE_DATA), state_size, size); __xstate_dump_leaves(); return -EINVAL; } return 0; } /* * We have a C struct for each 'xstate'. We need to ensure * that our software representation matches what the CPU * tells us about the state's size. */ static bool __init check_xstate_against_struct(int nr) { /* * Ask the CPU for the size of the state. */ int sz = xfeature_size(nr); /* * Match each CPU state with the corresponding software * structure. */ switch (nr) { case XFEATURE_YMM: return XCHECK_SZ(sz, nr, struct ymmh_struct); case XFEATURE_BNDREGS: return XCHECK_SZ(sz, nr, struct mpx_bndreg_state); case XFEATURE_BNDCSR: return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state); case XFEATURE_OPMASK: return XCHECK_SZ(sz, nr, struct avx_512_opmask_state); case XFEATURE_ZMM_Hi256: return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state); case XFEATURE_Hi16_ZMM: return XCHECK_SZ(sz, nr, struct avx_512_hi16_state); case XFEATURE_PKRU: return XCHECK_SZ(sz, nr, struct pkru_state); case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state); case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg); case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state); case XFEATURE_CET_KERNEL: return XCHECK_SZ(sz, nr, struct cet_supervisor_state); case XFEATURE_APX: return XCHECK_SZ(sz, nr, struct apx_state); case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true; default: XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr); return false; } return true; } static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) { unsigned int topmost = fls64(xfeatures) - 1; unsigned int offset, i; if (topmost <= XFEATURE_SSE) return sizeof(struct xregs_state); if (compacted) { offset = xfeature_get_offset(xfeatures, topmost); } else { /* Walk through the xfeature order to pick the last */ for_each_extended_xfeature_in_order(i, xfeatures) topmost = xfeature_uncompact_order[i]; offset = xstate_offsets[topmost]; } return offset + xstate_sizes[topmost]; } /* * This essentially double-checks what the cpu told us about * how large the XSAVE buffer needs to be. We are recalculating * it to be safe. * * Independent XSAVE features allocate their own buffers and are not * covered by these checks. Only the size of the buffer for task->fpu * is checked here. */ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) { bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES); unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; int i; for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { if (!check_xstate_against_struct(i)) return false; /* * Supervisor state components can be managed only by * XSAVES. */ if (!xsaves && xfeature_is_supervisor(i)) { XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i); return false; } } size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted); XSTATE_WARN_ON(size != kernel_size, "size %u != kernel_size %u\n", size, kernel_size); return size == kernel_size; } /* * Get total size of enabled xstates in XCR0 | IA32_XSS. * * Note the SDM's wording here. "sub-function 0" only enumerates * the size of the *user* states. If we use it to size a buffer * that we use 'XSAVES' on, we could potentially overflow the * buffer because 'XSAVES' saves system states too. * * This also takes compaction into account. So this works for * XSAVEC as well. */ static unsigned int __init get_compacted_size(void) { unsigned int eax, ebx, ecx, edx; /* * - CPUID function 0DH, sub-function 1: * EBX enumerates the size (in bytes) required by * the XSAVES instruction for an XSAVE area * containing all the state components * corresponding to bits currently set in * XCR0 | IA32_XSS. * * When XSAVES is not available but XSAVEC is (virt), then there * are no supervisor states, but XSAVEC still uses compacted * format. */ cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx); return ebx; } /* * Get the total size of the enabled xstates without the independent supervisor * features. */ static unsigned int __init get_xsave_compacted_size(void) { u64 mask = xfeatures_mask_independent(); unsigned int size; if (!mask) return get_compacted_size(); /* Disable independent features. */ wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor()); /* * Ask the hardware what size is required of the buffer. * This is the size required for the task->fpu buffer. */ size = get_compacted_size(); /* Re-enable independent features so XSAVES will work on them again. */ wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); return size; } static unsigned int __init get_xsave_size_user(void) { unsigned int eax, ebx, ecx, edx; /* * - CPUID function 0DH, sub-function 0: * EBX enumerates the size (in bytes) required by * the XSAVE instruction for an XSAVE area * containing all the *user* state components * corresponding to bits currently set in XCR0. */ cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx); return ebx; } static int __init init_xstate_size(void) { /* Recompute the context size for enabled features: */ unsigned int user_size, kernel_size, kernel_default_size; bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); /* Uncompacted user space size */ user_size = get_xsave_size_user(); /* * XSAVES kernel size includes supervisor states and uses compacted * format. XSAVEC uses compacted format, but does not save * supervisor states. * * XSAVE[OPT] do not support supervisor states so kernel and user * size is identical. */ if (compacted) kernel_size = get_xsave_compacted_size(); else kernel_size = user_size; kernel_default_size = xstate_calculate_size(fpu_kernel_cfg.default_features, compacted); if (!paranoid_xstate_size_valid(kernel_size)) return -EINVAL; fpu_kernel_cfg.max_size = kernel_size; fpu_user_cfg.max_size = user_size; fpu_kernel_cfg.default_size = kernel_default_size; fpu_user_cfg.default_size = xstate_calculate_size(fpu_user_cfg.default_features, false); guest_default_cfg.size = xstate_calculate_size(guest_default_cfg.features, compacted); return 0; } /* * We enabled the XSAVE hardware, but something went wrong and * we can not use it. Disable it. */ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) { pr_info("x86/fpu: XSAVE disabled\n"); fpu_kernel_cfg.max_features = 0; cr4_clear_bits(X86_CR4_OSXSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVE); /* Restore the legacy size.*/ fpu_kernel_cfg.max_size = legacy_size; fpu_kernel_cfg.default_size = legacy_size; fpu_user_cfg.max_size = legacy_size; fpu_user_cfg.default_size = legacy_size; guest_default_cfg.size = legacy_size; /* * Prevent enabling the static branch which enables writes to the * XFD MSR. */ init_fpstate.xfd = 0; fpstate_reset(x86_task_fpu(current)); } static u64 __init host_default_mask(void) { /* * Exclude dynamic features (require userspace opt-in) and features * that are supported only for KVM guests. */ return ~((u64)XFEATURE_MASK_USER_DYNAMIC | XFEATURE_MASK_GUEST_SUPERVISOR); } static u64 __init guest_default_mask(void) { /* * Exclude dynamic features, which require userspace opt-in even * for KVM guests. */ return ~(u64)XFEATURE_MASK_USER_DYNAMIC; } /* * Enable and initialize the xsave feature. * Called once per system bootup. */ void __init fpu__init_system_xstate(unsigned int legacy_size) { unsigned int eax, ebx, ecx, edx; u64 xfeatures; int err; int i; if (!boot_cpu_has(X86_FEATURE_FPU)) { pr_info("x86/fpu: No FPU detected\n"); return; } if (!boot_cpu_has(X86_FEATURE_XSAVE)) { pr_info("x86/fpu: x87 FPU will use %s\n", boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE"); return; } /* * Find user xstates supported by the processor. */ cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx); fpu_kernel_cfg.max_features = eax + ((u64)edx << 32); /* * Find supervisor xstates supported by the processor. */ cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx); fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32); if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { /* * This indicates that something really unexpected happened * with the enumeration. Disable XSAVE and try to continue * booting without it. This is too early to BUG(). */ pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", fpu_kernel_cfg.max_features); goto out_disable; } if (fpu_kernel_cfg.max_features & XFEATURE_MASK_APX && fpu_kernel_cfg.max_features & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) { /* * This is a problematic CPU configuration where two * conflicting state components are both enumerated. */ pr_err("x86/fpu: Both APX/MPX present in the CPU's xstate features: 0x%llx.\n", fpu_kernel_cfg.max_features); goto out_disable; } fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features & XFEATURE_MASK_INDEPENDENT; /* * Clear XSAVE features that are disabled in the normal CPUID. */ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { unsigned short cid = xsave_cpuid_features[i]; /* Careful: X86_FEATURE_FPU is 0! */ if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid)) fpu_kernel_cfg.max_features &= ~BIT_ULL(i); } if (!cpu_feature_enabled(X86_FEATURE_XFD)) fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC; if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; else fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED; fpu_user_cfg.max_features = fpu_kernel_cfg.max_features; fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; /* * Now, given maximum feature set, determine default values by * applying default masks. */ fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features & host_default_mask(); fpu_user_cfg.default_features = fpu_user_cfg.max_features & host_default_mask(); guest_default_cfg.features = fpu_kernel_cfg.max_features & guest_default_mask(); /* Store it for paranoia check at the end */ xfeatures = fpu_kernel_cfg.max_features; /* * Initialize the default XFD state in initfp_state and enable the * dynamic sizing mechanism if dynamic states are available. The * static key cannot be enabled here because this runs before * jump_label_init(). This is delayed to an initcall. */ init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC; /* Set up compaction feature bit */ if (cpu_feature_enabled(X86_FEATURE_XSAVEC) || cpu_feature_enabled(X86_FEATURE_XSAVES)) setup_force_cpu_cap(X86_FEATURE_XCOMPACTED); /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); /* Cache size, offset and flags for initialization */ setup_xstate_cache(); err = init_xstate_size(); if (err) goto out_disable; /* * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: */ update_regset_xstate_info(fpu_user_cfg.max_size, fpu_user_cfg.max_features); /* * init_fpstate excludes dynamic states as they are large but init * state is zero. */ init_fpstate.size = fpu_kernel_cfg.default_size; init_fpstate.xfeatures = fpu_kernel_cfg.default_features; if (init_fpstate.size > sizeof(init_fpstate.regs)) { pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d)\n", sizeof(init_fpstate.regs), init_fpstate.size); goto out_disable; } setup_init_fpu_buf(); /* * Paranoia check whether something in the setup modified the * xfeatures mask. */ if (xfeatures != fpu_kernel_cfg.max_features) { pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init\n", xfeatures, fpu_kernel_cfg.max_features); goto out_disable; } /* * CPU capabilities initialization runs before FPU init. So * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely * functional, set the feature bit so depending code works. */ setup_force_cpu_cap(X86_FEATURE_OSXSAVE); print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", fpu_kernel_cfg.max_features, fpu_kernel_cfg.max_size, boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard"); return; out_disable: /* something went wrong, try to boot without any XSAVE support */ fpu__init_disable_system_xstate(legacy_size); } /* * Restore minimal FPU state after suspend: */ void fpu__resume_cpu(void) { /* * Restore XCR0 on xsave capable CPUs: */ if (cpu_feature_enabled(X86_FEATURE_XSAVE)) xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); /* * Restore IA32_XSS. The same CPUID bit enumerates support * of XSAVES and MSR_IA32_XSS. */ if (cpu_feature_enabled(X86_FEATURE_XSAVES)) { wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | xfeatures_mask_independent()); } if (fpu_state_size_dynamic()) wrmsrq(MSR_IA32_XFD, x86_task_fpu(current)->fpstate->xfd); } /* * Given an xstate feature nr, calculate where in the xsave * buffer the state is. Callers should ensure that the buffer * is valid. */ static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) { u64 xcomp_bv = xsave->header.xcomp_bv; if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) return NULL; if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) { if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr)))) return NULL; } return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr); } /* * Given the xsave area and a state inside, this function returns the * address of the state. * * This is the API that is called to get xstate address in either * standard format or compacted format of xsave area. * * Note that if there is no data for the field in the xsave buffer * this will return NULL. * * Inputs: * xstate: the thread's storage area for all FPU data * xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP, * XFEATURE_SSE, etc...) * Output: * address of the state in the xsave area, or NULL if the * field is not present in the xsave buffer. */ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) { /* * Do we even *have* xsave state? */ if (!boot_cpu_has(X86_FEATURE_XSAVE)) return NULL; /* * We should not ever be requesting features that we * have not enabled. */ if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) return NULL; /* * This assumes the last 'xsave*' instruction to * have requested that 'xfeature_nr' be saved. * If it did not, we might be seeing and old value * of the field in the buffer. * * This can happen because the last 'xsave' did not * request that this feature be saved (unlikely) * or because the "init optimization" caused it * to not be saved. */ if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr))) return NULL; return __raw_xsave_addr(xsave, xfeature_nr); } EXPORT_SYMBOL_GPL(get_xsave_addr); /* * Given an xstate feature nr, calculate where in the xsave buffer the state is. * The xsave buffer should be in standard format, not compacted (e.g. user mode * signal frames). */ void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr) { if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) return NULL; return (void __user *)xsave + xstate_offsets[xfeature_nr]; } #ifdef CONFIG_ARCH_HAS_PKEYS /* * This will go out and modify PKRU register to set the access * rights for @pkey to @init_val. */ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) { u32 old_pkru, new_pkru_bits = 0; int pkey_shift; /* * This check implies XSAVE support. OSPKE only gets * set if we enable XSAVE and we enable PKU in XCR0. */ if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return -EINVAL; /* * This code should only be called with valid 'pkey' * values originating from in-kernel users. Complain * if a bad value is observed. */ if (WARN_ON_ONCE(pkey >= arch_max_pkey())) return -EINVAL; /* Set the bits we need in PKRU: */ if (init_val & PKEY_DISABLE_ACCESS) new_pkru_bits |= PKRU_AD_BIT; if (init_val & PKEY_DISABLE_WRITE) new_pkru_bits |= PKRU_WD_BIT; /* Shift the bits in to the correct place in PKRU for pkey: */ pkey_shift = pkey * PKRU_BITS_PER_PKEY; new_pkru_bits <<= pkey_shift; /* Get old PKRU and mask off any old bits in place: */ old_pkru = read_pkru(); old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); /* Write old part along with new part: */ write_pkru(old_pkru | new_pkru_bits); return 0; } #endif /* ! CONFIG_ARCH_HAS_PKEYS */ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, void *init_xstate, unsigned int size) { membuf_write(to, from_xstate ? xstate : init_xstate, size); } /** * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer * @to: membuf descriptor * @fpstate: The fpstate buffer from which to copy * @xfeatures: The mask of xfeatures to save (XSAVE mode only) * @pkru_val: The PKRU value to store in the PKRU component * @copy_mode: The requested copy mode * * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming * format, i.e. from the kernel internal hardware dependent storage format * to the requested @mode. UABI XSTATE is always uncompacted! * * It supports partial copy but @to.pos always starts from zero. */ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, u64 xfeatures, u32 pkru_val, enum xstate_copy_mode copy_mode) { const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); struct xregs_state *xinit = &init_fpstate.regs.xsave; struct xregs_state *xsave = &fpstate->regs.xsave; unsigned int zerofrom, i, xfeature; struct xstate_header header; u64 mask; memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; /* Mask out the feature bits depending on copy mode */ switch (copy_mode) { case XSTATE_COPY_FP: header.xfeatures &= XFEATURE_MASK_FP; break; case XSTATE_COPY_FX: header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE; break; case XSTATE_COPY_XSAVE: header.xfeatures &= fpstate->user_xfeatures & xfeatures; break; } /* Copy FP state up to MXCSR */ copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387, &xinit->i387, off_mxcsr); /* Copy MXCSR when SSE or YMM are set in the feature mask */ copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM), &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr, MXCSR_AND_FLAGS_SIZE); /* Copy the remaining FP state */ copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387.st_space, &xinit->i387.st_space, sizeof(xsave->i387.st_space)); /* Copy the SSE state - shared with YMM, but independently managed */ copy_feature(header.xfeatures & XFEATURE_MASK_SSE, &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space, sizeof(xsave->i387.xmm_space)); if (copy_mode != XSTATE_COPY_XSAVE) goto out; /* Zero the padding area */ membuf_zero(&to, sizeof(xsave->i387.padding)); /* Copy xsave->i387.sw_reserved */ membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved)); /* Copy the user space relevant state of @xsave->header */ membuf_write(&to, &header, sizeof(header)); zerofrom = offsetof(struct xregs_state, extended_state_area); /* * This 'mask' indicates which states to copy from fpstate. * Those extended states that are not present in fpstate are * either disabled or initialized: * * In non-compacted format, disabled features still occupy * state space but there is no state to copy from in the * compacted init_fpstate. The gap tracking will zero these * states. * * The extended features have an all zeroes init state. Thus, * remove them from 'mask' to zero those features in the user * buffer instead of retrieving them from init_fpstate. */ mask = header.xfeatures; for_each_extended_xfeature_in_order(i, mask) { xfeature = xfeature_uncompact_order[i]; /* * If there was a feature or alignment gap, zero the space * in the destination buffer. */ if (zerofrom < xstate_offsets[xfeature]) membuf_zero(&to, xstate_offsets[xfeature] - zerofrom); if (xfeature == XFEATURE_PKRU) { struct pkru_state pkru = {0}; /* * PKRU is not necessarily up to date in the * XSAVE buffer. Use the provided value. */ pkru.pkru = pkru_val; membuf_write(&to, &pkru, sizeof(pkru)); } else { membuf_write(&to, __raw_xsave_addr(xsave, xfeature), xstate_sizes[xfeature]); } /* * Keep track of the last copied state in the non-compacted * target buffer for gap zeroing. */ zerofrom = xstate_offsets[xfeature] + xstate_sizes[xfeature]; } out: if (to.left) membuf_zero(&to, to.left); } /** * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer * @to: membuf descriptor * @tsk: The task from which to copy the saved xstate * @copy_mode: The requested copy mode * * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming * format, i.e. from the kernel internal hardware dependent storage format * to the requested @mode. UABI XSTATE is always uncompacted! * * It supports partial copy but @to.pos always starts from zero. */ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode copy_mode) { __copy_xstate_to_uabi_buf(to, x86_task_fpu(tsk)->fpstate, x86_task_fpu(tsk)->fpstate->user_xfeatures, tsk->thread.pkru, copy_mode); } static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, const void *kbuf, const void __user *ubuf) { if (kbuf) { memcpy(dst, kbuf + offset, size); } else { if (copy_from_user(dst, ubuf + offset, size)) return -EFAULT; } return 0; } /** * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate * @fpstate: The fpstate buffer to copy to * @kbuf: The UABI format buffer, if it comes from the kernel * @ubuf: The UABI format buffer, if it comes from userspace * @pkru: The location to write the PKRU value to * * Converts from the UABI format into the kernel internal hardware * dependent format. * * This function ultimately has three different callers with distinct PKRU * behavior. * 1. When called from sigreturn the PKRU register will be restored from * @fpstate via an XRSTOR. Correctly copying the UABI format buffer to * @fpstate is sufficient to cover this case, but the caller will also * pass a pointer to the thread_struct's pkru field in @pkru and updating * it is harmless. * 2. When called from ptrace the PKRU register will be restored from the * thread_struct's pkru field. A pointer to that is passed in @pkru. * The kernel will restore it manually, so the XRSTOR behavior that resets * the PKRU register to the hardware init value (0) if the corresponding * xfeatures bit is not set is emulated here. * 3. When called from KVM the PKRU register will be restored from the vcpu's * pkru field. A pointer to that is passed in @pkru. KVM hasn't used * XRSTOR and hasn't had the PKRU resetting behavior described above. To * preserve that KVM behavior, it passes NULL for @pkru if the xfeatures * bit is not set. */ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, const void __user *ubuf, u32 *pkru) { struct xregs_state *xsave = &fpstate->regs.xsave; unsigned int offset, size; struct xstate_header hdr; u64 mask; int i; offset = offsetof(struct xregs_state, header); if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf)) return -EFAULT; if (validate_user_xstate_header(&hdr, fpstate)) return -EINVAL; /* Validate MXCSR when any of the related features is in use */ mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM; if (hdr.xfeatures & mask) { u32 mxcsr[2]; offset = offsetof(struct fxregs_state, mxcsr); if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf)) return -EFAULT; /* Reserved bits in MXCSR must be zero. */ if (mxcsr[0] & ~mxcsr_feature_mask) return -EINVAL; /* SSE and YMM require MXCSR even when FP is not in use. */ if (!(hdr.xfeatures & XFEATURE_MASK_FP)) { xsave->i387.mxcsr = mxcsr[0]; xsave->i387.mxcsr_mask = mxcsr[1]; } } for (i = 0; i < XFEATURE_MAX; i++) { mask = BIT_ULL(i); if (hdr.xfeatures & mask) { void *dst = __raw_xsave_addr(xsave, i); offset = xstate_offsets[i]; size = xstate_sizes[i]; if (copy_from_buffer(dst, offset, size, kbuf, ubuf)) return -EFAULT; } } if (hdr.xfeatures & XFEATURE_MASK_PKRU) { struct pkru_state *xpkru; xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU); *pkru = xpkru->pkru; } else { /* * KVM may pass NULL here to indicate that it does not need * PKRU updated. */ if (pkru) *pkru = 0; } /* * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': */ xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL; /* * Add back in the features that came in from userspace: */ xsave->header.xfeatures |= hdr.xfeatures; return 0; } /* * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] * format and copy to the target thread. Used by ptrace and KVM. */ int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru) { return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru); } /* * Convert from a sigreturn standard-format user-space buffer to kernel * XSAVE[S] format and copy to the target thread. This is called from the * sigreturn() and rt_sigreturn() system calls. */ int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf) { return copy_uabi_to_xstate(x86_task_fpu(tsk)->fpstate, NULL, ubuf, &tsk->thread.pkru); } static bool validate_independent_components(u64 mask) { u64 xchk; if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES))) return false; xchk = ~xfeatures_mask_independent(); if (WARN_ON_ONCE(!mask || mask & xchk)) return false; return true; } /** * xsaves - Save selected components to a kernel xstate buffer * @xstate: Pointer to the buffer * @mask: Feature mask to select the components to save * * The @xstate buffer must be 64 byte aligned and correctly initialized as * XSAVES does not write the full xstate header. Before first use the * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer * can #GP. * * The feature mask must be a subset of the independent features. */ void xsaves(struct xregs_state *xstate, u64 mask) { int err; if (!validate_independent_components(mask)) return; XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err); WARN_ON_ONCE(err); } /** * xrstors - Restore selected components from a kernel xstate buffer * @xstate: Pointer to the buffer * @mask: Feature mask to select the components to restore * * The @xstate buffer must be 64 byte aligned and correctly initialized * otherwise XRSTORS from that buffer can #GP. * * Proper usage is to restore the state which was saved with * xsaves() into @xstate. * * The feature mask must be a subset of the independent features. */ void xrstors(struct xregs_state *xstate, u64 mask) { int err; if (!validate_independent_components(mask)) return; XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err); WARN_ON_ONCE(err); } #if IS_ENABLED(CONFIG_KVM) void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeature) { void *addr = get_xsave_addr(&fpstate->regs.xsave, xfeature); if (addr) memset(addr, 0, xstate_sizes[xfeature]); } EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component); #endif #ifdef CONFIG_X86_64 #ifdef CONFIG_X86_DEBUG_FPU /* * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask * can safely operate on the @fpstate buffer. */ static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor) { u64 xfd = __this_cpu_read(xfd_state); if (fpstate->xfd == xfd) return true; /* * The XFD MSR does not match fpstate->xfd. That's invalid when * the passed in fpstate is current's fpstate. */ if (fpstate->xfd == x86_task_fpu(current)->fpstate->xfd) return false; /* * XRSTOR(S) from init_fpstate are always correct as it will just * bring all components into init state and not read from the * buffer. XSAVE(S) raises #PF after init. */ if (fpstate == &init_fpstate) return rstor; /* * XSAVE(S): clone(), fpu_swap_kvm_fpstate() * XRSTORS(S): fpu_swap_kvm_fpstate() */ /* * No XSAVE/XRSTOR instructions (except XSAVE itself) touch * the buffer area for XFD-disabled state components. */ mask &= ~xfd; /* * Remove features which are valid in fpstate. They * have space allocated in fpstate. */ mask &= ~fpstate->xfeatures; /* * Any remaining state components in 'mask' might be written * by XSAVE/XRSTOR. Fail validation it found. */ return !mask; } void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor)); } #endif /* CONFIG_X86_DEBUG_FPU */ static int __init xfd_update_static_branch(void) { /* * If init_fpstate.xfd has bits set then dynamic features are * available and the dynamic sizing must be enabled. */ if (init_fpstate.xfd) static_branch_enable(&__fpu_state_size_dynamic); return 0; } arch_initcall(xfd_update_static_branch) void fpstate_free(struct fpu *fpu) { if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate) vfree(fpu->fpstate); } /** * fpstate_realloc - Reallocate struct fpstate for the requested new features * * @xfeatures: A bitmap of xstate features which extend the enabled features * of that task * @ksize: The required size for the kernel buffer * @usize: The required size for user space buffers * @guest_fpu: Pointer to a guest FPU container. NULL for host allocations * * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer * terminates quickly, vfree()-induced IPIs may be a concern, but tasks * with large states are likely to live longer. * * Returns: 0 on success, -ENOMEM on allocation error. */ static int fpstate_realloc(u64 xfeatures, unsigned int ksize, unsigned int usize, struct fpu_guest *guest_fpu) { struct fpu *fpu = x86_task_fpu(current); struct fpstate *curfps, *newfps = NULL; unsigned int fpsize; bool in_use; fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64); newfps = vzalloc(fpsize); if (!newfps) return -ENOMEM; newfps->size = ksize; newfps->user_size = usize; newfps->is_valloc = true; /* * When a guest FPU is supplied, use @guest_fpu->fpstate * as reference independent whether it is in use or not. */ curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate; /* Determine whether @curfps is the active fpstate */ in_use = fpu->fpstate == curfps; if (guest_fpu) { newfps->is_guest = true; newfps->is_confidential = curfps->is_confidential; newfps->in_use = curfps->in_use; guest_fpu->xfeatures |= xfeatures; guest_fpu->uabi_size = usize; } fpregs_lock(); /* * If @curfps is in use, ensure that the current state is in the * registers before swapping fpstate as that might invalidate it * due to layout changes. */ if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD)) fpregs_restore_userregs(); newfps->xfeatures = curfps->xfeatures | xfeatures; newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; newfps->xfd = curfps->xfd & ~xfeatures; /* Do the final updates within the locked region */ xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures); if (guest_fpu) { guest_fpu->fpstate = newfps; /* If curfps is active, update the FPU fpstate pointer */ if (in_use) fpu->fpstate = newfps; } else { fpu->fpstate = newfps; } if (in_use) xfd_update_state(fpu->fpstate); fpregs_unlock(); /* Only free valloc'ed state */ if (curfps && curfps->is_valloc) vfree(curfps); return 0; } static int validate_sigaltstack(unsigned int usize) { struct task_struct *thread, *leader = current->group_leader; unsigned long framesize = get_sigframe_size(); lockdep_assert_held(&current->sighand->siglock); /* get_sigframe_size() is based on fpu_user_cfg.max_size */ framesize -= fpu_user_cfg.max_size; framesize += usize; for_each_thread(leader, thread) { if (thread->sas_ss_size && thread->sas_ss_size < framesize) return -ENOSPC; } return 0; } static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) { /* * This deliberately does not exclude !XSAVES as we still might * decide to optionally context switch XCR0 or talk the silicon * vendors into extending XFD for the pre AMX states, especially * AVX512. */ bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); struct fpu *fpu = x86_task_fpu(current->group_leader); struct fpu_state_perm *perm; unsigned int ksize, usize; u64 mask; int ret = 0; /* Check whether fully enabled */ if ((permitted & requested) == requested) return 0; /* * Calculate the resulting kernel state size. Note, @permitted also * contains supervisor xfeatures even though supervisor are always * permitted for kernel and guest FPUs, and never permitted for user * FPUs. */ mask = permitted | requested; ksize = xstate_calculate_size(mask, compacted); /* * Calculate the resulting user state size. Take care not to clobber * the supervisor xfeatures in the new mask! */ usize = xstate_calculate_size(mask & XFEATURE_MASK_USER_SUPPORTED, false); if (!guest) { ret = validate_sigaltstack(usize); if (ret) return ret; } perm = guest ? &fpu->guest_perm : &fpu->perm; /* Pairs with the READ_ONCE() in xstate_get_group_perm() */ WRITE_ONCE(perm->__state_perm, mask); /* Protected by sighand lock */ perm->__state_size = ksize; perm->__user_state_size = usize; return ret; } /* * Permissions array to map facilities with more than one component */ static const u64 xstate_prctl_req[XFEATURE_MAX] = { [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA, }; static int xstate_request_perm(unsigned long idx, bool guest) { u64 permitted, requested; int ret; if (idx >= XFEATURE_MAX) return -EINVAL; /* * Look up the facility mask which can require more than * one xstate component. */ idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req)); requested = xstate_prctl_req[idx]; if (!requested) return -EOPNOTSUPP; if ((fpu_user_cfg.max_features & requested) != requested) return -EOPNOTSUPP; /* Lockless quick check */ permitted = xstate_get_group_perm(guest); if ((permitted & requested) == requested) return 0; /* Protect against concurrent modifications */ spin_lock_irq(&current->sighand->siglock); permitted = xstate_get_group_perm(guest); /* First vCPU allocation locks the permissions. */ if (guest && (permitted & FPU_GUEST_PERM_LOCKED)) ret = -EBUSY; else ret = __xstate_request_perm(permitted, requested, guest); spin_unlock_irq(&current->sighand->siglock); return ret; } int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu) { u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC; struct fpu_state_perm *perm; unsigned int ksize, usize; struct fpu *fpu; if (!xfd_event) { if (!guest_fpu) pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err); return 0; } /* Protect against concurrent modifications */ spin_lock_irq(&current->sighand->siglock); /* If not permitted let it die */ if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) { spin_unlock_irq(&current->sighand->siglock); return -EPERM; } fpu = x86_task_fpu(current->group_leader); perm = guest_fpu ? &fpu->guest_perm : &fpu->perm; ksize = perm->__state_size; usize = perm->__user_state_size; /* * The feature is permitted. State size is sufficient. Dropping * the lock is safe here even if more features are added from * another task, the retrieved buffer sizes are valid for the * currently requested feature(s). */ spin_unlock_irq(&current->sighand->siglock); /* * Try to allocate a new fpstate. If that fails there is no way * out. */ if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu)) return -EFAULT; return 0; } int xfd_enable_feature(u64 xfd_err) { return __xfd_enable_feature(xfd_err, NULL); } #else /* CONFIG_X86_64 */ static inline int xstate_request_perm(unsigned long idx, bool guest) { return -EPERM; } #endif /* !CONFIG_X86_64 */ u64 xstate_get_guest_group_perm(void) { return xstate_get_group_perm(true); } EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm); /** * fpu_xstate_prctl - xstate permission operations * @option: A subfunction of arch_prctl() * @arg2: option argument * Return: 0 if successful; otherwise, an error code * * Option arguments: * * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info * ARCH_REQ_XCOMP_PERM: Facility number requested * * For facilities which require more than one XSTATE component, the request * must be the highest state component number related to that facility, * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18). */ long fpu_xstate_prctl(int option, unsigned long arg2) { u64 __user *uptr = (u64 __user *)arg2; u64 permitted, supported; unsigned long idx = arg2; bool guest = false; switch (option) { case ARCH_GET_XCOMP_SUPP: supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features; return put_user(supported, uptr); case ARCH_GET_XCOMP_PERM: /* * Lockless snapshot as it can also change right after the * dropping the lock. */ permitted = xstate_get_host_group_perm(); permitted &= XFEATURE_MASK_USER_SUPPORTED; return put_user(permitted, uptr); case ARCH_GET_XCOMP_GUEST_PERM: permitted = xstate_get_guest_group_perm(); permitted &= XFEATURE_MASK_USER_SUPPORTED; return put_user(permitted, uptr); case ARCH_REQ_XCOMP_GUEST_PERM: guest = true; fallthrough; case ARCH_REQ_XCOMP_PERM: if (!IS_ENABLED(CONFIG_X86_64)) return -EOPNOTSUPP; return xstate_request_perm(idx, guest); default: return -EINVAL; } } #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * Report the amount of time elapsed in millisecond since last AVX512 * use in the task. Report -1 if no AVX-512 usage. */ static void avx512_status(struct seq_file *m, struct task_struct *task) { unsigned long timestamp; long delta = -1; /* AVX-512 usage is not tracked for kernel threads. Don't report anything. */ if (task->flags & (PF_KTHREAD | PF_USER_WORKER)) return; timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp); if (timestamp) { delta = (long)(jiffies - timestamp); /* * Cap to LONG_MAX if time difference > LONG_MAX */ if (delta < 0) delta = LONG_MAX; delta = jiffies_to_msecs(delta); } seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta); seq_putc(m, '\n'); } /* * Report architecture specific information */ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { /* * Report AVX512 state if the processor and build option supported. */ if (cpu_feature_enabled(X86_FEATURE_AVX512F)) avx512_status(m, task); return 0; } #endif /* CONFIG_PROC_PID_ARCH_STATUS */ #ifdef CONFIG_COREDUMP static const char owner_name[] = "LINUX"; /* * Dump type, size, offset and flag values for every xfeature that is present. */ static int dump_xsave_layout_desc(struct coredump_params *cprm) { int num_records = 0; int i; for_each_extended_xfeature(i, fpu_user_cfg.max_features) { struct x86_xfeat_component xc = { .type = i, .size = xstate_sizes[i], .offset = xstate_offsets[i], /* reserved for future use */ .flags = 0, }; if (!dump_emit(cprm, &xc, sizeof(xc))) return 0; num_records++; } return num_records; } static u32 get_xsave_desc_size(void) { u32 cnt = 0; u32 i; for_each_extended_xfeature(i, fpu_user_cfg.max_features) cnt++; return cnt * (sizeof(struct x86_xfeat_component)); } int elf_coredump_extra_notes_write(struct coredump_params *cprm) { int num_records = 0; struct elf_note en; if (!fpu_user_cfg.max_features) return 0; en.n_namesz = sizeof(owner_name); en.n_descsz = get_xsave_desc_size(); en.n_type = NT_X86_XSAVE_LAYOUT; if (!dump_emit(cprm, &en, sizeof(en))) return 1; if (!dump_emit(cprm, owner_name, en.n_namesz)) return 1; if (!dump_align(cprm, 4)) return 1; num_records = dump_xsave_layout_desc(cprm); if (!num_records) return 1; /* Total size should be equal to the number of records */ if ((sizeof(struct x86_xfeat_component) * num_records) != en.n_descsz) return 1; return 0; } int elf_coredump_extra_notes_size(void) { int size; if (!fpu_user_cfg.max_features) return 0; /* .note header */ size = sizeof(struct elf_note); /* Name plus alignment to 4 bytes */ size += roundup(sizeof(owner_name), 4); size += get_xsave_desc_size(); return size; } #endif /* CONFIG_COREDUMP */
5255 5249 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM workqueue #if !defined(_TRACE_WORKQUEUE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_WORKQUEUE_H #include <linux/tracepoint.h> #include <linux/workqueue.h> struct pool_workqueue; /** * workqueue_queue_work - called when a work gets queued * @req_cpu: the requested cpu * @pwq: pointer to struct pool_workqueue * @work: pointer to struct work_struct * * This event occurs when a work is queued immediately or once a * delayed work is actually queued on a workqueue (ie: once the delay * has been reached). */ TRACE_EVENT(workqueue_queue_work, TP_PROTO(int req_cpu, struct pool_workqueue *pwq, struct work_struct *work), TP_ARGS(req_cpu, pwq, work), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) __string( workqueue, pwq->wq->name) __field( int, req_cpu ) __field( int, cpu ) ), TP_fast_assign( __entry->work = work; __entry->function = work->func; __assign_str(workqueue); __entry->req_cpu = req_cpu; __entry->cpu = pwq->pool->cpu; ), TP_printk("work struct=%p function=%ps workqueue=%s req_cpu=%d cpu=%d", __entry->work, __entry->function, __get_str(workqueue), __entry->req_cpu, __entry->cpu) ); /** * workqueue_activate_work - called when a work gets activated * @work: pointer to struct work_struct * * This event occurs when a queued work is put on the active queue, * which happens immediately after queueing unless @max_active limit * is reached. */ TRACE_EVENT(workqueue_activate_work, TP_PROTO(struct work_struct *work), TP_ARGS(work), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) ), TP_fast_assign( __entry->work = work; __entry->function = work->func; ), TP_printk("work struct %p function=%ps ", __entry->work, __entry->function) ); /** * workqueue_execute_start - called immediately before the workqueue callback * @work: pointer to struct work_struct * * Allows to track workqueue execution. */ TRACE_EVENT(workqueue_execute_start, TP_PROTO(struct work_struct *work), TP_ARGS(work), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) ), TP_fast_assign( __entry->work = work; __entry->function = work->func; ), TP_printk("work struct %p: function %ps", __entry->work, __entry->function) ); /** * workqueue_execute_end - called immediately after the workqueue callback * @work: pointer to struct work_struct * @function: pointer to worker function * * Allows to track workqueue execution. */ TRACE_EVENT(workqueue_execute_end, TP_PROTO(struct work_struct *work, work_func_t function), TP_ARGS(work, function), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) ), TP_fast_assign( __entry->work = work; __entry->function = function; ), TP_printk("work struct %p: function %ps", __entry->work, __entry->function) ); #endif /* _TRACE_WORKQUEUE_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
21 665 21 21 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 /* SPDX-License-Identifier: GPL-2.0 */ /* linux/net/inet/arp.h */ #ifndef _ARP_H #define _ARP_H #include <linux/if_arp.h> #include <linux/hash.h> #include <net/neighbour.h> extern struct neigh_table arp_tbl; static inline u32 arp_hashfn(const void *pkey, const struct net_device *dev, u32 *hash_rnd) { u32 key = *(const u32 *)pkey; u32 val = key ^ hash32_ptr(dev); return val * hash_rnd[0]; } #ifdef CONFIG_INET static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) key = INADDR_ANY; return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev); } #else static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { return NULL; } #endif static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32 key) { struct neighbour *n; rcu_read_lock(); n = __ipv4_neigh_lookup_noref(dev, key); if (n && !refcount_inc_not_zero(&n->refcnt)) n = NULL; rcu_read_unlock(); return n; } static inline void __ipv4_confirm_neigh(struct net_device *dev, u32 key) { struct neighbour *n; rcu_read_lock(); n = __ipv4_neigh_lookup_noref(dev, key); neigh_confirm(n); rcu_read_unlock(); } void arp_init(void); int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg); void arp_send(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *th); int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir); void arp_ifdown(struct net_device *dev); int arp_invalidate(struct net_device *dev, __be32 ip, bool force); struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw); void arp_xmit(struct sk_buff *skb); #endif /* _ARP_H */
75 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 // SPDX-License-Identifier: GPL-2.0 /* RTT/RTO calculation. * * Adapted from TCP for AF_RXRPC by David Howells (dhowells@redhat.com) * * https://tools.ietf.org/html/rfc6298 * https://tools.ietf.org/html/rfc1122#section-4.2.3.1 * http://ccr.sigcomm.org/archive/1995/jan95/ccr-9501-partridge87.pdf */ #include <linux/net.h> #include "ar-internal.h" #define RXRPC_RTO_MAX (120 * USEC_PER_SEC) #define RXRPC_TIMEOUT_INIT ((unsigned int)(1 * USEC_PER_SEC)) /* RFC6298 2.1 initial RTO value */ #define rxrpc_jiffies32 ((u32)jiffies) /* As rxrpc_jiffies32 */ static u32 rxrpc_rto_min_us(struct rxrpc_call *call) { return 200; } static u32 __rxrpc_set_rto(const struct rxrpc_call *call) { return (call->srtt_us >> 3) + call->rttvar_us; } static u32 rxrpc_bound_rto(u32 rto) { return clamp(200000, rto + 100000, RXRPC_RTO_MAX); } /* * Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 * piece by Van Jacobson. * NOTE: the next three routines used to be one big routine. * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ static void rxrpc_rtt_estimator(struct rxrpc_call *call, long sample_rtt_us) { long m = sample_rtt_us; /* RTT */ u32 srtt = call->srtt_us; /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev * are scaled versions of rtt and mean deviation. * This is designed to be as fast as possible * m stands for "measurement". * * On a 1990 paper the rto value is changed to: * RTO = rtt + 4 * mdev * * Funny. This algorithm seems to be very broken. * These formulae increase RTO, when it should be decreased, increase * too slowly, when it should be increased quickly, decrease too quickly * etc. I guess in BSD RTO takes ONE value, so that it is absolutely * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) */ if (srtt != 0) { m -= (srtt >> 3); /* m is now error in rtt est */ srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) { m = -m; /* m is now abs(error) */ m -= (call->mdev_us >> 2); /* similar update on mdev */ /* This is similar to one of Eifel findings. * Eifel blocks mdev updates when rtt decreases. * This solution is a bit different: we use finer gain * for mdev in this case (alpha*beta). * Like Eifel it also prevents growth of rto, * but also it limits too fast rto decreases, * happening in pure Eifel. */ if (m > 0) m >>= 3; } else { m -= (call->mdev_us >> 2); /* similar update on mdev */ } call->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ if (call->mdev_us > call->mdev_max_us) { call->mdev_max_us = call->mdev_us; if (call->mdev_max_us > call->rttvar_us) call->rttvar_us = call->mdev_max_us; } } else { /* no previous measure. */ srtt = m << 3; /* take the measured time to be rtt */ call->mdev_us = m << 1; /* make sure rto = 3*rtt */ call->rttvar_us = umax(call->mdev_us, rxrpc_rto_min_us(call)); call->mdev_max_us = call->rttvar_us; } call->srtt_us = umax(srtt, 1); } /* * Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ static void rxrpc_set_rto(struct rxrpc_call *call) { u32 rto; /* 1. If rtt variance happened to be less 50msec, it is hallucination. * It cannot be less due to utterly erratic ACK generation made * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ * to do with delayed acks, because at cwnd>2 true delack timeout * is invisible. Actually, Linux-2.4 also generates erratic * ACKs in some circumstances. */ rto = __rxrpc_set_rto(call); /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, * all the algo is pure shit and should be replaced * with correct one. It is exactly, which we pretend to do. */ /* NOTE: clamping at RXRPC_RTO_MIN is not required, current algo * guarantees that rto is higher. */ call->rto_us = rxrpc_bound_rto(rto); } static void rxrpc_update_rtt_min(struct rxrpc_call *call, ktime_t resp_time, long rtt_us) { /* Window size 5mins in approx usec (ipv4.sysctl_tcp_min_rtt_wlen) */ u32 wlen_us = 5ULL * NSEC_PER_SEC / 1024; minmax_running_min(&call->min_rtt, wlen_us, resp_time / 1024, (u32)rtt_us ? : jiffies_to_usecs(1)); } static void rxrpc_ack_update_rtt(struct rxrpc_call *call, ktime_t resp_time, long rtt_us) { if (rtt_us < 0) return; /* Update RACK min RTT [RFC8985 6.1 Step 1]. */ rxrpc_update_rtt_min(call, resp_time, rtt_us); rxrpc_rtt_estimator(call, rtt_us); rxrpc_set_rto(call); /* Only reset backoff on valid RTT measurement [RFC6298]. */ call->backoff = 0; } /* * Add RTT information to cache. This is called in softirq mode and has * exclusive access to the call RTT data. */ void rxrpc_call_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, int rtt_slot, rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, ktime_t send_time, ktime_t resp_time) { s64 rtt_us; rtt_us = ktime_to_us(ktime_sub(resp_time, send_time)); if (rtt_us < 0) return; rxrpc_ack_update_rtt(call, resp_time, rtt_us); if (call->rtt_count < 3) call->rtt_count++; call->rtt_taken++; WRITE_ONCE(call->peer->recent_srtt_us, call->srtt_us / 8); WRITE_ONCE(call->peer->recent_rto_us, call->rto_us); trace_rxrpc_rtt_rx(call, why, rtt_slot, send_serial, resp_serial, rtt_us, call->srtt_us, call->rto_us); } /* * Get the retransmission timeout to set in nanoseconds, backing it off each * time we retransmit. */ ktime_t rxrpc_get_rto_backoff(struct rxrpc_call *call, bool retrans) { u64 timo_us; u32 backoff = READ_ONCE(call->backoff); timo_us = call->rto_us; timo_us <<= backoff; if (retrans && timo_us * 2 <= RXRPC_RTO_MAX) WRITE_ONCE(call->backoff, backoff + 1); if (timo_us < 1) timo_us = 1; return ns_to_ktime(timo_us * NSEC_PER_USEC); } void rxrpc_call_init_rtt(struct rxrpc_call *call) { call->rtt_last_req = KTIME_MIN; call->rto_us = RXRPC_TIMEOUT_INIT; call->mdev_us = RXRPC_TIMEOUT_INIT; call->backoff = 0; //minmax_reset(&call->rtt_min, rxrpc_jiffies32, ~0U); }
9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 // SPDX-License-Identifier: GPL-2.0-only /* * sd.c Copyright (C) 1992 Drew Eckhardt * Copyright (C) 1993, 1994, 1995, 1999 Eric Youngdale * * Linux scsi disk driver * Initial versions: Drew Eckhardt * Subsequent revisions: Eric Youngdale * Modification history: * - Drew Eckhardt <drew@colorado.edu> original * - Eric Youngdale <eric@andante.org> add scatter-gather, multiple * outstanding request, and other enhancements. * Support loadable low-level scsi drivers. * - Jirka Hanika <geo@ff.cuni.cz> support more scsi disks using * eight major numbers. * - Richard Gooch <rgooch@atnf.csiro.au> support devfs. * - Torben Mathiasen <tmm@image.dk> Resource allocation fixes in * sd_init and cleanups. * - Alex Davis <letmein@erols.com> Fix problem where partition info * not being read in sd_open. Fix problem where removable media * could be ejected after sd_open. * - Douglas Gilbert <dgilbert@interlog.com> cleanup for lk 2.5.x * - Badari Pulavarty <pbadari@us.ibm.com>, Matthew Wilcox * <willy@debian.org>, Kurt Garloff <garloff@suse.de>: * Support 32k/1M disks. * * Logging policy (needs CONFIG_SCSI_LOGGING defined): * - setting up transfer: SCSI_LOG_HLQUEUE levels 1 and 2 * - end of transfer (bh + scsi_lib): SCSI_LOG_HLCOMPLETE level 1 * - entering sd_ioctl: SCSI_LOG_IOCTL level 1 * - entering other commands: SCSI_LOG_HLQUEUE level 3 * Note: when the logging level is set by the user, it must be greater * than the level indicated above to trigger output. */ #include <linux/bio-integrity.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/hdreg.h> #include <linux/errno.h> #include <linux/idr.h> #include <linux/interrupt.h> #include <linux/init.h> #include <linux/blkdev.h> #include <linux/blkpg.h> #include <linux/blk-pm.h> #include <linux/delay.h> #include <linux/rw_hint.h> #include <linux/major.h> #include <linux/mutex.h> #include <linux/string_helpers.h> #include <linux/slab.h> #include <linux/sed-opal.h> #include <linux/pm_runtime.h> #include <linux/pr.h> #include <linux/t10-pi.h> #include <linux/uaccess.h> #include <linux/unaligned.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_dbg.h> #include <scsi/scsi_device.h> #include <scsi/scsi_devinfo.h> #include <scsi/scsi_driver.h> #include <scsi/scsi_eh.h> #include <scsi/scsi_host.h> #include <scsi/scsi_ioctl.h> #include <scsi/scsicam.h> #include <scsi/scsi_common.h> #include "sd.h" #include "scsi_priv.h" #include "scsi_logging.h" MODULE_AUTHOR("Eric Youngdale"); MODULE_DESCRIPTION("SCSI disk (sd) driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK0_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK1_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK2_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK3_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK4_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK5_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK6_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK7_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK8_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK9_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK10_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK11_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK12_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK13_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK14_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK15_MAJOR); MODULE_ALIAS_SCSI_DEVICE(TYPE_DISK); MODULE_ALIAS_SCSI_DEVICE(TYPE_MOD); MODULE_ALIAS_SCSI_DEVICE(TYPE_RBC); MODULE_ALIAS_SCSI_DEVICE(TYPE_ZBC); #define SD_MINORS 16 static void sd_config_discard(struct scsi_disk *sdkp, struct queue_limits *lim, unsigned int mode); static void sd_config_write_same(struct scsi_disk *sdkp, struct queue_limits *lim); static void sd_revalidate_disk(struct gendisk *); static void sd_unlock_native_capacity(struct gendisk *disk); static void sd_shutdown(struct device *); static void scsi_disk_release(struct device *cdev); static DEFINE_IDA(sd_index_ida); static mempool_t *sd_page_pool; static struct lock_class_key sd_bio_compl_lkclass; static const char *sd_cache_types[] = { "write through", "none", "write back", "write back, no read (daft)" }; static void sd_set_flush_flag(struct scsi_disk *sdkp, struct queue_limits *lim) { if (sdkp->WCE) { lim->features |= BLK_FEAT_WRITE_CACHE; if (sdkp->DPOFUA) lim->features |= BLK_FEAT_FUA; else lim->features &= ~BLK_FEAT_FUA; } else { lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA); } } static ssize_t cache_type_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int ct, rcd, wce, sp; struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; char buffer[64]; char *buffer_data; struct scsi_mode_data data; struct scsi_sense_hdr sshdr; static const char temp[] = "temporary "; int len, ret; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC) /* no cache control on RBC devices; theoretically they * can do it, but there's probably so many exceptions * it's not worth the risk */ return -EINVAL; if (strncmp(buf, temp, sizeof(temp) - 1) == 0) { buf += sizeof(temp) - 1; sdkp->cache_override = 1; } else { sdkp->cache_override = 0; } ct = sysfs_match_string(sd_cache_types, buf); if (ct < 0) return -EINVAL; rcd = ct & 0x01 ? 1 : 0; wce = (ct & 0x02) && !sdkp->write_prot ? 1 : 0; if (sdkp->cache_override) { struct queue_limits lim; sdkp->WCE = wce; sdkp->RCD = rcd; lim = queue_limits_start_update(sdkp->disk->queue); sd_set_flush_flag(sdkp, &lim); ret = queue_limits_commit_update_frozen(sdkp->disk->queue, &lim); if (ret) return ret; return count; } if (scsi_mode_sense(sdp, 0x08, 8, 0, buffer, sizeof(buffer), SD_TIMEOUT, sdkp->max_retries, &data, NULL)) return -EINVAL; len = min_t(size_t, sizeof(buffer), data.length - data.header_length - data.block_descriptor_length); buffer_data = buffer + data.header_length + data.block_descriptor_length; buffer_data[2] &= ~0x05; buffer_data[2] |= wce << 2 | rcd; sp = buffer_data[0] & 0x80 ? 1 : 0; buffer_data[0] &= ~0x80; /* * Ensure WP, DPOFUA, and RESERVED fields are cleared in * received mode parameter buffer before doing MODE SELECT. */ data.device_specific = 0; ret = scsi_mode_select(sdp, 1, sp, buffer_data, len, SD_TIMEOUT, sdkp->max_retries, &data, &sshdr); if (ret) { if (ret > 0 && scsi_sense_valid(&sshdr)) sd_print_sense_hdr(sdkp, &sshdr); return -EINVAL; } sd_revalidate_disk(sdkp->disk); return count; } static ssize_t manage_start_stop_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; return sysfs_emit(buf, "%u\n", sdp->manage_system_start_stop && sdp->manage_runtime_start_stop && sdp->manage_shutdown); } static DEVICE_ATTR_RO(manage_start_stop); static ssize_t manage_system_start_stop_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; return sysfs_emit(buf, "%u\n", sdp->manage_system_start_stop); } static ssize_t manage_system_start_stop_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; bool v; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (kstrtobool(buf, &v)) return -EINVAL; sdp->manage_system_start_stop = v; return count; } static DEVICE_ATTR_RW(manage_system_start_stop); static ssize_t manage_runtime_start_stop_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; return sysfs_emit(buf, "%u\n", sdp->manage_runtime_start_stop); } static ssize_t manage_runtime_start_stop_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; bool v; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (kstrtobool(buf, &v)) return -EINVAL; sdp->manage_runtime_start_stop = v; return count; } static DEVICE_ATTR_RW(manage_runtime_start_stop); static ssize_t manage_shutdown_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; return sysfs_emit(buf, "%u\n", sdp->manage_shutdown); } static ssize_t manage_shutdown_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; bool v; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (kstrtobool(buf, &v)) return -EINVAL; sdp->manage_shutdown = v; return count; } static DEVICE_ATTR_RW(manage_shutdown); static ssize_t allow_restart_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->device->allow_restart); } static ssize_t allow_restart_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { bool v; struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC) return -EINVAL; if (kstrtobool(buf, &v)) return -EINVAL; sdp->allow_restart = v; return count; } static DEVICE_ATTR_RW(allow_restart); static ssize_t cache_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); int ct = sdkp->RCD + 2*sdkp->WCE; return sprintf(buf, "%s\n", sd_cache_types[ct]); } static DEVICE_ATTR_RW(cache_type); static ssize_t FUA_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->DPOFUA); } static DEVICE_ATTR_RO(FUA); static ssize_t protection_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->protection_type); } static ssize_t protection_type_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); unsigned int val; int err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; err = kstrtouint(buf, 10, &val); if (err) return err; if (val <= T10_PI_TYPE3_PROTECTION) sdkp->protection_type = val; return count; } static DEVICE_ATTR_RW(protection_type); static ssize_t protection_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; unsigned int dif, dix; dif = scsi_host_dif_capable(sdp->host, sdkp->protection_type); dix = scsi_host_dix_capable(sdp->host, sdkp->protection_type); if (!dix && scsi_host_dix_capable(sdp->host, T10_PI_TYPE0_PROTECTION)) { dif = 0; dix = 1; } if (!dif && !dix) return sprintf(buf, "none\n"); return sprintf(buf, "%s%u\n", dix ? "dix" : "dif", dif); } static DEVICE_ATTR_RO(protection_mode); static ssize_t app_tag_own_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->ATO); } static DEVICE_ATTR_RO(app_tag_own); static ssize_t thin_provisioning_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->lbpme); } static DEVICE_ATTR_RO(thin_provisioning); /* sysfs_match_string() requires dense arrays */ static const char *lbp_mode[] = { [SD_LBP_FULL] = "full", [SD_LBP_UNMAP] = "unmap", [SD_LBP_WS16] = "writesame_16", [SD_LBP_WS10] = "writesame_10", [SD_LBP_ZERO] = "writesame_zero", [SD_LBP_DISABLE] = "disabled", }; static ssize_t provisioning_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%s\n", lbp_mode[sdkp->provisioning_mode]); } static ssize_t provisioning_mode_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; struct queue_limits lim; int mode, err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (sdp->type != TYPE_DISK) return -EINVAL; mode = sysfs_match_string(lbp_mode, buf); if (mode < 0) return -EINVAL; lim = queue_limits_start_update(sdkp->disk->queue); sd_config_discard(sdkp, &lim, mode); err = queue_limits_commit_update_frozen(sdkp->disk->queue, &lim); if (err) return err; return count; } static DEVICE_ATTR_RW(provisioning_mode); /* sysfs_match_string() requires dense arrays */ static const char *zeroing_mode[] = { [SD_ZERO_WRITE] = "write", [SD_ZERO_WS] = "writesame", [SD_ZERO_WS16_UNMAP] = "writesame_16_unmap", [SD_ZERO_WS10_UNMAP] = "writesame_10_unmap", }; static ssize_t zeroing_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%s\n", zeroing_mode[sdkp->zeroing_mode]); } static ssize_t zeroing_mode_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); int mode; if (!capable(CAP_SYS_ADMIN)) return -EACCES; mode = sysfs_match_string(zeroing_mode, buf); if (mode < 0) return -EINVAL; sdkp->zeroing_mode = mode; return count; } static DEVICE_ATTR_RW(zeroing_mode); static ssize_t max_medium_access_timeouts_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->max_medium_access_timeouts); } static ssize_t max_medium_access_timeouts_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); int err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; err = kstrtouint(buf, 10, &sdkp->max_medium_access_timeouts); return err ? err : count; } static DEVICE_ATTR_RW(max_medium_access_timeouts); static ssize_t max_write_same_blocks_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->max_ws_blocks); } static ssize_t max_write_same_blocks_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; struct queue_limits lim; unsigned long max; int err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC) return -EINVAL; err = kstrtoul(buf, 10, &max); if (err) return err; if (max == 0) sdp->no_write_same = 1; else if (max <= SD_MAX_WS16_BLOCKS) { sdp->no_write_same = 0; sdkp->max_ws_blocks = max; } lim = queue_limits_start_update(sdkp->disk->queue); sd_config_write_same(sdkp, &lim); err = queue_limits_commit_update_frozen(sdkp->disk->queue, &lim); if (err) return err; return count; } static DEVICE_ATTR_RW(max_write_same_blocks); static ssize_t zoned_cap_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); if (sdkp->device->type == TYPE_ZBC) return sprintf(buf, "host-managed\n"); if (sdkp->zoned == 1) return sprintf(buf, "host-aware\n"); if (sdkp->zoned == 2) return sprintf(buf, "drive-managed\n"); return sprintf(buf, "none\n"); } static DEVICE_ATTR_RO(zoned_cap); static ssize_t max_retries_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdev = sdkp->device; int retries, err; err = kstrtoint(buf, 10, &retries); if (err) return err; if (retries == SCSI_CMD_RETRIES_NO_LIMIT || retries <= SD_MAX_RETRIES) { sdkp->max_retries = retries; return count; } sdev_printk(KERN_ERR, sdev, "max_retries must be between -1 and %d\n", SD_MAX_RETRIES); return -EINVAL; } static ssize_t max_retries_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%d\n", sdkp->max_retries); } static DEVICE_ATTR_RW(max_retries); static struct attribute *sd_disk_attrs[] = { &dev_attr_cache_type.attr, &dev_attr_FUA.attr, &dev_attr_allow_restart.attr, &dev_attr_manage_start_stop.attr, &dev_attr_manage_system_start_stop.attr, &dev_attr_manage_runtime_start_stop.attr, &dev_attr_manage_shutdown.attr, &dev_attr_protection_type.attr, &dev_attr_protection_mode.attr, &dev_attr_app_tag_own.attr, &dev_attr_thin_provisioning.attr, &dev_attr_provisioning_mode.attr, &dev_attr_zeroing_mode.attr, &dev_attr_max_write_same_blocks.attr, &dev_attr_max_medium_access_timeouts.attr, &dev_attr_zoned_cap.attr, &dev_attr_max_retries.attr, NULL, }; ATTRIBUTE_GROUPS(sd_disk); static struct class sd_disk_class = { .name = "scsi_disk", .dev_release = scsi_disk_release, .dev_groups = sd_disk_groups, }; /* * Don't request a new module, as that could deadlock in multipath * environment. */ static void sd_default_probe(dev_t devt) { } /* * Device no to disk mapping: * * major disc2 disc p1 * |............|.............|....|....| <- dev_t * 31 20 19 8 7 4 3 0 * * Inside a major, we have 16k disks, however mapped non- * contiguously. The first 16 disks are for major0, the next * ones with major1, ... Disk 256 is for major0 again, disk 272 * for major1, ... * As we stay compatible with our numbering scheme, we can reuse * the well-know SCSI majors 8, 65--71, 136--143. */ static int sd_major(int major_idx) { switch (major_idx) { case 0: return SCSI_DISK0_MAJOR; case 1 ... 7: return SCSI_DISK1_MAJOR + major_idx - 1; case 8 ... 15: return SCSI_DISK8_MAJOR + major_idx - 8; default: BUG(); return 0; /* shut up gcc */ } } #ifdef CONFIG_BLK_SED_OPAL static int sd_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, bool send) { struct scsi_disk *sdkp = data; struct scsi_device *sdev = sdkp->device; u8 cdb[12] = { 0, }; const struct scsi_exec_args exec_args = { .req_flags = BLK_MQ_REQ_PM, }; int ret; cdb[0] = send ? SECURITY_PROTOCOL_OUT : SECURITY_PROTOCOL_IN; cdb[1] = secp; put_unaligned_be16(spsp, &cdb[2]); put_unaligned_be32(len, &cdb[6]); ret = scsi_execute_cmd(sdev, cdb, send ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, buffer, len, SD_TIMEOUT, sdkp->max_retries, &exec_args); return ret <= 0 ? ret : -EIO; } #endif /* CONFIG_BLK_SED_OPAL */ /* * Look up the DIX operation based on whether the command is read or * write and whether dix and dif are enabled. */ static unsigned int sd_prot_op(bool write, bool dix, bool dif) { /* Lookup table: bit 2 (write), bit 1 (dix), bit 0 (dif) */ static const unsigned int ops[] = { /* wrt dix dif */ SCSI_PROT_NORMAL, /* 0 0 0 */ SCSI_PROT_READ_STRIP, /* 0 0 1 */ SCSI_PROT_READ_INSERT, /* 0 1 0 */ SCSI_PROT_READ_PASS, /* 0 1 1 */ SCSI_PROT_NORMAL, /* 1 0 0 */ SCSI_PROT_WRITE_INSERT, /* 1 0 1 */ SCSI_PROT_WRITE_STRIP, /* 1 1 0 */ SCSI_PROT_WRITE_PASS, /* 1 1 1 */ }; return ops[write << 2 | dix << 1 | dif]; } /* * Returns a mask of the protection flags that are valid for a given DIX * operation. */ static unsigned int sd_prot_flag_mask(unsigned int prot_op) { static const unsigned int flag_mask[] = { [SCSI_PROT_NORMAL] = 0, [SCSI_PROT_READ_STRIP] = SCSI_PROT_TRANSFER_PI | SCSI_PROT_GUARD_CHECK | SCSI_PROT_REF_CHECK | SCSI_PROT_REF_INCREMENT, [SCSI_PROT_READ_INSERT] = SCSI_PROT_REF_INCREMENT | SCSI_PROT_IP_CHECKSUM, [SCSI_PROT_READ_PASS] = SCSI_PROT_TRANSFER_PI | SCSI_PROT_GUARD_CHECK | SCSI_PROT_REF_CHECK | SCSI_PROT_REF_INCREMENT | SCSI_PROT_IP_CHECKSUM, [SCSI_PROT_WRITE_INSERT] = SCSI_PROT_TRANSFER_PI | SCSI_PROT_REF_INCREMENT, [SCSI_PROT_WRITE_STRIP] = SCSI_PROT_GUARD_CHECK | SCSI_PROT_REF_CHECK | SCSI_PROT_REF_INCREMENT | SCSI_PROT_IP_CHECKSUM, [SCSI_PROT_WRITE_PASS] = SCSI_PROT_TRANSFER_PI | SCSI_PROT_GUARD_CHECK | SCSI_PROT_REF_CHECK | SCSI_PROT_REF_INCREMENT | SCSI_PROT_IP_CHECKSUM, }; return flag_mask[prot_op]; } static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd, unsigned int dix, unsigned int dif) { struct request *rq = scsi_cmd_to_rq(scmd); struct bio *bio = rq->bio; unsigned int prot_op = sd_prot_op(rq_data_dir(rq), dix, dif); unsigned int protect = 0; if (dix) { /* DIX Type 0, 1, 2, 3 */ if (bio_integrity_flagged(bio, BIP_IP_CHECKSUM)) scmd->prot_flags |= SCSI_PROT_IP_CHECKSUM; if (bio_integrity_flagged(bio, BIP_CHECK_GUARD)) scmd->prot_flags |= SCSI_PROT_GUARD_CHECK; } if (dif != T10_PI_TYPE3_PROTECTION) { /* DIX/DIF Type 0, 1, 2 */ scmd->prot_flags |= SCSI_PROT_REF_INCREMENT; if (bio_integrity_flagged(bio, BIP_CHECK_REFTAG)) scmd->prot_flags |= SCSI_PROT_REF_CHECK; } if (dif) { /* DIX/DIF Type 1, 2, 3 */ scmd->prot_flags |= SCSI_PROT_TRANSFER_PI; if (bio_integrity_flagged(bio, BIP_DISK_NOCHECK)) protect = 3 << 5; /* Disable target PI checking */ else protect = 1 << 5; /* Enable target PI checking */ } scsi_set_prot_op(scmd, prot_op); scsi_set_prot_type(scmd, dif); scmd->prot_flags &= sd_prot_flag_mask(prot_op); return protect; } static void sd_disable_discard(struct scsi_disk *sdkp) { sdkp->provisioning_mode = SD_LBP_DISABLE; blk_queue_disable_discard(sdkp->disk->queue); } static void sd_config_discard(struct scsi_disk *sdkp, struct queue_limits *lim, unsigned int mode) { unsigned int logical_block_size = sdkp->device->sector_size; unsigned int max_blocks = 0; lim->discard_alignment = sdkp->unmap_alignment * logical_block_size; lim->discard_granularity = max(sdkp->physical_block_size, sdkp->unmap_granularity * logical_block_size); sdkp->provisioning_mode = mode; switch (mode) { case SD_LBP_FULL: case SD_LBP_DISABLE: break; case SD_LBP_UNMAP: max_blocks = min_not_zero(sdkp->max_unmap_blocks, (u32)SD_MAX_WS16_BLOCKS); break; case SD_LBP_WS16: if (sdkp->device->unmap_limit_for_ws) max_blocks = sdkp->max_unmap_blocks; else max_blocks = sdkp->max_ws_blocks; max_blocks = min_not_zero(max_blocks, (u32)SD_MAX_WS16_BLOCKS); break; case SD_LBP_WS10: if (sdkp->device->unmap_limit_for_ws) max_blocks = sdkp->max_unmap_blocks; else max_blocks = sdkp->max_ws_blocks; max_blocks = min_not_zero(max_blocks, (u32)SD_MAX_WS10_BLOCKS); break; case SD_LBP_ZERO: max_blocks = min_not_zero(sdkp->max_ws_blocks, (u32)SD_MAX_WS10_BLOCKS); break; } lim->max_hw_discard_sectors = max_blocks * (logical_block_size >> SECTOR_SHIFT); } static void *sd_set_special_bvec(struct request *rq, unsigned int data_len) { struct page *page; page = mempool_alloc(sd_page_pool, GFP_ATOMIC); if (!page) return NULL; clear_highpage(page); bvec_set_page(&rq->special_vec, page, data_len, 0); rq->rq_flags |= RQF_SPECIAL_PAYLOAD; return bvec_virt(&rq->special_vec); } static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) { struct scsi_device *sdp = cmd->device; struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); unsigned int data_len = 24; char *buf; buf = sd_set_special_bvec(rq, data_len); if (!buf) return BLK_STS_RESOURCE; cmd->cmd_len = 10; cmd->cmnd[0] = UNMAP; cmd->cmnd[8] = 24; put_unaligned_be16(6 + 16, &buf[0]); put_unaligned_be16(16, &buf[2]); put_unaligned_be64(lba, &buf[8]); put_unaligned_be32(nr_blocks, &buf[16]); cmd->allowed = sdkp->max_retries; cmd->transfersize = data_len; rq->timeout = SD_TIMEOUT; return scsi_alloc_sgtables(cmd); } static void sd_config_atomic(struct scsi_disk *sdkp, struct queue_limits *lim) { unsigned int logical_block_size = sdkp->device->sector_size, physical_block_size_sectors, max_atomic, unit_min, unit_max; if ((!sdkp->max_atomic && !sdkp->max_atomic_with_boundary) || sdkp->protection_type == T10_PI_TYPE2_PROTECTION) return; physical_block_size_sectors = sdkp->physical_block_size / sdkp->device->sector_size; unit_min = rounddown_pow_of_two(sdkp->atomic_granularity ? sdkp->atomic_granularity : physical_block_size_sectors); /* * Only use atomic boundary when we have the odd scenario of * sdkp->max_atomic == 0, which the spec does permit. */ if (sdkp->max_atomic) { max_atomic = sdkp->max_atomic; unit_max = rounddown_pow_of_two(sdkp->max_atomic); sdkp->use_atomic_write_boundary = 0; } else { max_atomic = sdkp->max_atomic_with_boundary; unit_max = rounddown_pow_of_two(sdkp->max_atomic_boundary); sdkp->use_atomic_write_boundary = 1; } /* * Ensure compliance with granularity and alignment. For now, keep it * simple and just don't support atomic writes for values mismatched * with max_{boundary}atomic, physical block size, and * atomic_granularity itself. * * We're really being distrustful by checking unit_max also... */ if (sdkp->atomic_granularity > 1) { if (unit_min > 1 && unit_min % sdkp->atomic_granularity) return; if (unit_max > 1 && unit_max % sdkp->atomic_granularity) return; } if (sdkp->atomic_alignment > 1) { if (unit_min > 1 && unit_min % sdkp->atomic_alignment) return; if (unit_max > 1 && unit_max % sdkp->atomic_alignment) return; } lim->atomic_write_hw_max = max_atomic * logical_block_size; lim->atomic_write_hw_boundary = 0; lim->atomic_write_hw_unit_min = unit_min * logical_block_size; lim->atomic_write_hw_unit_max = unit_max * logical_block_size; lim->features |= BLK_FEAT_ATOMIC_WRITES; } static blk_status_t sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap) { struct scsi_device *sdp = cmd->device; struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); u32 data_len = sdp->sector_size; if (!sd_set_special_bvec(rq, data_len)) return BLK_STS_RESOURCE; cmd->cmd_len = 16; cmd->cmnd[0] = WRITE_SAME_16; if (unmap) cmd->cmnd[1] = 0x8; /* UNMAP */ put_unaligned_be64(lba, &cmd->cmnd[2]); put_unaligned_be32(nr_blocks, &cmd->cmnd[10]); cmd->allowed = sdkp->max_retries; cmd->transfersize = data_len; rq->timeout = unmap ? SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT; return scsi_alloc_sgtables(cmd); } static blk_status_t sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap) { struct scsi_device *sdp = cmd->device; struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); u32 data_len = sdp->sector_size; if (!sd_set_special_bvec(rq, data_len)) return BLK_STS_RESOURCE; cmd->cmd_len = 10; cmd->cmnd[0] = WRITE_SAME; if (unmap) cmd->cmnd[1] = 0x8; /* UNMAP */ put_unaligned_be32(lba, &cmd->cmnd[2]); put_unaligned_be16(nr_blocks, &cmd->cmnd[7]); cmd->allowed = sdkp->max_retries; cmd->transfersize = data_len; rq->timeout = unmap ? SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT; return scsi_alloc_sgtables(cmd); } static blk_status_t sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_device *sdp = cmd->device; struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); if (!(rq->cmd_flags & REQ_NOUNMAP)) { switch (sdkp->zeroing_mode) { case SD_ZERO_WS16_UNMAP: return sd_setup_write_same16_cmnd(cmd, true); case SD_ZERO_WS10_UNMAP: return sd_setup_write_same10_cmnd(cmd, true); } } if (sdp->no_write_same) { rq->rq_flags |= RQF_QUIET; return BLK_STS_TARGET; } if (sdkp->ws16 || lba > 0xffffffff || nr_blocks > 0xffff) return sd_setup_write_same16_cmnd(cmd, false); return sd_setup_write_same10_cmnd(cmd, false); } static void sd_disable_write_same(struct scsi_disk *sdkp) { sdkp->device->no_write_same = 1; sdkp->max_ws_blocks = 0; blk_queue_disable_write_zeroes(sdkp->disk->queue); } static void sd_config_write_same(struct scsi_disk *sdkp, struct queue_limits *lim) { unsigned int logical_block_size = sdkp->device->sector_size; if (sdkp->device->no_write_same) { sdkp->max_ws_blocks = 0; goto out; } /* Some devices can not handle block counts above 0xffff despite * supporting WRITE SAME(16). Consequently we default to 64k * blocks per I/O unless the device explicitly advertises a * bigger limit. */ if (sdkp->max_ws_blocks > SD_MAX_WS10_BLOCKS) sdkp->max_ws_blocks = min_not_zero(sdkp->max_ws_blocks, (u32)SD_MAX_WS16_BLOCKS); else if (sdkp->ws16 || sdkp->ws10 || sdkp->device->no_report_opcodes) sdkp->max_ws_blocks = min_not_zero(sdkp->max_ws_blocks, (u32)SD_MAX_WS10_BLOCKS); else { sdkp->device->no_write_same = 1; sdkp->max_ws_blocks = 0; } if (sdkp->lbprz && sdkp->lbpws) sdkp->zeroing_mode = SD_ZERO_WS16_UNMAP; else if (sdkp->lbprz && sdkp->lbpws10) sdkp->zeroing_mode = SD_ZERO_WS10_UNMAP; else if (sdkp->max_ws_blocks) sdkp->zeroing_mode = SD_ZERO_WS; else sdkp->zeroing_mode = SD_ZERO_WRITE; if (sdkp->max_ws_blocks && sdkp->physical_block_size > logical_block_size) { /* * Reporting a maximum number of blocks that is not aligned * on the device physical size would cause a large write same * request to be split into physically unaligned chunks by * __blkdev_issue_write_zeroes() even if the caller of this * functions took care to align the large request. So make sure * the maximum reported is aligned to the device physical block * size. This is only an optional optimization for regular * disks, but this is mandatory to avoid failure of large write * same requests directed at sequential write required zones of * host-managed ZBC disks. */ sdkp->max_ws_blocks = round_down(sdkp->max_ws_blocks, bytes_to_logical(sdkp->device, sdkp->physical_block_size)); } out: lim->max_write_zeroes_sectors = sdkp->max_ws_blocks * (logical_block_size >> SECTOR_SHIFT); if (sdkp->zeroing_mode == SD_ZERO_WS16_UNMAP || sdkp->zeroing_mode == SD_ZERO_WS10_UNMAP) lim->max_hw_wzeroes_unmap_sectors = lim->max_write_zeroes_sectors; } static blk_status_t sd_setup_flush_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_disk *sdkp = scsi_disk(rq->q->disk); /* flush requests don't perform I/O, zero the S/G table */ memset(&cmd->sdb, 0, sizeof(cmd->sdb)); if (cmd->device->use_16_for_sync) { cmd->cmnd[0] = SYNCHRONIZE_CACHE_16; cmd->cmd_len = 16; } else { cmd->cmnd[0] = SYNCHRONIZE_CACHE; cmd->cmd_len = 10; } cmd->transfersize = 0; cmd->allowed = sdkp->max_retries; rq->timeout = rq->q->rq_timeout * SD_FLUSH_TIMEOUT_MULTIPLIER; return BLK_STS_OK; } /** * sd_group_number() - Compute the GROUP NUMBER field * @cmd: SCSI command for which to compute the value of the six-bit GROUP NUMBER * field. * * From SBC-5 r05 (https://www.t10.org/cgi-bin/ac.pl?t=f&f=sbc5r05.pdf): * 0: no relative lifetime. * 1: shortest relative lifetime. * 2: second shortest relative lifetime. * 3 - 0x3d: intermediate relative lifetimes. * 0x3e: second longest relative lifetime. * 0x3f: longest relative lifetime. */ static u8 sd_group_number(struct scsi_cmnd *cmd) { const struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_disk *sdkp = scsi_disk(rq->q->disk); if (!sdkp->rscs) return 0; return min3((u32)rq->bio->bi_write_hint, (u32)sdkp->permanent_stream_count, 0x3fu); } static blk_status_t sd_setup_rw32_cmnd(struct scsi_cmnd *cmd, bool write, sector_t lba, unsigned int nr_blocks, unsigned char flags, unsigned int dld) { cmd->cmd_len = SD_EXT_CDB_SIZE; cmd->cmnd[0] = VARIABLE_LENGTH_CMD; cmd->cmnd[6] = sd_group_number(cmd); cmd->cmnd[7] = 0x18; /* Additional CDB len */ cmd->cmnd[9] = write ? WRITE_32 : READ_32; cmd->cmnd[10] = flags; cmd->cmnd[11] = dld & 0x07; put_unaligned_be64(lba, &cmd->cmnd[12]); put_unaligned_be32(lba, &cmd->cmnd[20]); /* Expected Indirect LBA */ put_unaligned_be32(nr_blocks, &cmd->cmnd[28]); return BLK_STS_OK; } static blk_status_t sd_setup_rw16_cmnd(struct scsi_cmnd *cmd, bool write, sector_t lba, unsigned int nr_blocks, unsigned char flags, unsigned int dld) { cmd->cmd_len = 16; cmd->cmnd[0] = write ? WRITE_16 : READ_16; cmd->cmnd[1] = flags | ((dld >> 2) & 0x01); cmd->cmnd[14] = ((dld & 0x03) << 6) | sd_group_number(cmd); cmd->cmnd[15] = 0; put_unaligned_be64(lba, &cmd->cmnd[2]); put_unaligned_be32(nr_blocks, &cmd->cmnd[10]); return BLK_STS_OK; } static blk_status_t sd_setup_rw10_cmnd(struct scsi_cmnd *cmd, bool write, sector_t lba, unsigned int nr_blocks, unsigned char flags) { cmd->cmd_len = 10; cmd->cmnd[0] = write ? WRITE_10 : READ_10; cmd->cmnd[1] = flags; cmd->cmnd[6] = sd_group_number(cmd); cmd->cmnd[9] = 0; put_unaligned_be32(lba, &cmd->cmnd[2]); put_unaligned_be16(nr_blocks, &cmd->cmnd[7]); return BLK_STS_OK; } static blk_status_t sd_setup_rw6_cmnd(struct scsi_cmnd *cmd, bool write, sector_t lba, unsigned int nr_blocks, unsigned char flags) { /* Avoid that 0 blocks gets translated into 256 blocks. */ if (WARN_ON_ONCE(nr_blocks == 0)) return BLK_STS_IOERR; if (unlikely(flags & 0x8)) { /* * This happens only if this drive failed 10byte rw * command with ILLEGAL_REQUEST during operation and * thus turned off use_10_for_rw. */ scmd_printk(KERN_ERR, cmd, "FUA write on READ/WRITE(6) drive\n"); return BLK_STS_IOERR; } cmd->cmd_len = 6; cmd->cmnd[0] = write ? WRITE_6 : READ_6; cmd->cmnd[1] = (lba >> 16) & 0x1f; cmd->cmnd[2] = (lba >> 8) & 0xff; cmd->cmnd[3] = lba & 0xff; cmd->cmnd[4] = nr_blocks; cmd->cmnd[5] = 0; return BLK_STS_OK; } /* * Check if a command has a duration limit set. If it does, and the target * device supports CDL and the feature is enabled, return the limit * descriptor index to use. Return 0 (no limit) otherwise. */ static int sd_cdl_dld(struct scsi_disk *sdkp, struct scsi_cmnd *scmd) { struct scsi_device *sdp = sdkp->device; int hint; if (!sdp->cdl_supported || !sdp->cdl_enable) return 0; /* * Use "no limit" if the request ioprio does not specify a duration * limit hint. */ hint = IOPRIO_PRIO_HINT(req_get_ioprio(scsi_cmd_to_rq(scmd))); if (hint < IOPRIO_HINT_DEV_DURATION_LIMIT_1 || hint > IOPRIO_HINT_DEV_DURATION_LIMIT_7) return 0; return (hint - IOPRIO_HINT_DEV_DURATION_LIMIT_1) + 1; } static blk_status_t sd_setup_atomic_cmnd(struct scsi_cmnd *cmd, sector_t lba, unsigned int nr_blocks, bool boundary, unsigned char flags) { cmd->cmd_len = 16; cmd->cmnd[0] = WRITE_ATOMIC_16; cmd->cmnd[1] = flags; put_unaligned_be64(lba, &cmd->cmnd[2]); put_unaligned_be16(nr_blocks, &cmd->cmnd[12]); if (boundary) put_unaligned_be16(nr_blocks, &cmd->cmnd[10]); else put_unaligned_be16(0, &cmd->cmnd[10]); put_unaligned_be16(nr_blocks, &cmd->cmnd[12]); cmd->cmnd[14] = 0; cmd->cmnd[15] = 0; return BLK_STS_OK; } static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_device *sdp = cmd->device; struct scsi_disk *sdkp = scsi_disk(rq->q->disk); sector_t lba = sectors_to_logical(sdp, blk_rq_pos(rq)); sector_t threshold; unsigned int nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); unsigned int mask = logical_to_sectors(sdp, 1) - 1; bool write = rq_data_dir(rq) == WRITE; unsigned char protect, fua; unsigned int dld; blk_status_t ret; unsigned int dif; bool dix; ret = scsi_alloc_sgtables(cmd); if (ret != BLK_STS_OK) return ret; ret = BLK_STS_IOERR; if (!scsi_device_online(sdp) || sdp->changed) { scmd_printk(KERN_ERR, cmd, "device offline or changed\n"); goto fail; } if (blk_rq_pos(rq) + blk_rq_sectors(rq) > get_capacity(rq->q->disk)) { scmd_printk(KERN_ERR, cmd, "access beyond end of device\n"); goto fail; } if ((blk_rq_pos(rq) & mask) || (blk_rq_sectors(rq) & mask)) { scmd_printk(KERN_ERR, cmd, "request not aligned to the logical block size\n"); goto fail; } /* * Some SD card readers can't handle accesses which touch the * last one or two logical blocks. Split accesses as needed. */ threshold = sdkp->capacity - SD_LAST_BUGGY_SECTORS; if (unlikely(sdp->last_sector_bug && lba + nr_blocks > threshold)) { if (lba < threshold) { /* Access up to the threshold but not beyond */ nr_blocks = threshold - lba; } else { /* Access only a single logical block */ nr_blocks = 1; } } fua = rq->cmd_flags & REQ_FUA ? 0x8 : 0; dix = scsi_prot_sg_count(cmd); dif = scsi_host_dif_capable(cmd->device->host, sdkp->protection_type); dld = sd_cdl_dld(sdkp, cmd); if (dif || dix) protect = sd_setup_protect_cmnd(cmd, dix, dif); else protect = 0; if (protect && sdkp->protection_type == T10_PI_TYPE2_PROTECTION) { ret = sd_setup_rw32_cmnd(cmd, write, lba, nr_blocks, protect | fua, dld); } else if (rq->cmd_flags & REQ_ATOMIC) { ret = sd_setup_atomic_cmnd(cmd, lba, nr_blocks, sdkp->use_atomic_write_boundary, protect | fua); } else if (sdp->use_16_for_rw || (nr_blocks > 0xffff)) { ret = sd_setup_rw16_cmnd(cmd, write, lba, nr_blocks, protect | fua, dld); } else if ((nr_blocks > 0xff) || (lba > 0x1fffff) || sdp->use_10_for_rw || protect || rq->bio->bi_write_hint) { ret = sd_setup_rw10_cmnd(cmd, write, lba, nr_blocks, protect | fua); } else { ret = sd_setup_rw6_cmnd(cmd, write, lba, nr_blocks, protect | fua); } if (unlikely(ret != BLK_STS_OK)) goto fail; /* * We shouldn't disconnect in the middle of a sector, so with a dumb * host adapter, it's safe to assume that we can at least transfer * this many bytes between each connect / disconnect. */ cmd->transfersize = sdp->sector_size; cmd->underflow = nr_blocks << 9; cmd->allowed = sdkp->max_retries; cmd->sdb.length = nr_blocks * sdp->sector_size; SCSI_LOG_HLQUEUE(1, scmd_printk(KERN_INFO, cmd, "%s: block=%llu, count=%d\n", __func__, (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq))); SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, cmd, "%s %d/%u 512 byte blocks.\n", write ? "writing" : "reading", nr_blocks, blk_rq_sectors(rq))); /* * This indicates that the command is ready from our end to be queued. */ return BLK_STS_OK; fail: scsi_free_sgtables(cmd); return ret; } static blk_status_t sd_init_command(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); switch (req_op(rq)) { case REQ_OP_DISCARD: switch (scsi_disk(rq->q->disk)->provisioning_mode) { case SD_LBP_UNMAP: return sd_setup_unmap_cmnd(cmd); case SD_LBP_WS16: return sd_setup_write_same16_cmnd(cmd, true); case SD_LBP_WS10: return sd_setup_write_same10_cmnd(cmd, true); case SD_LBP_ZERO: return sd_setup_write_same10_cmnd(cmd, false); default: return BLK_STS_TARGET; } case REQ_OP_WRITE_ZEROES: return sd_setup_write_zeroes_cmnd(cmd); case REQ_OP_FLUSH: return sd_setup_flush_cmnd(cmd); case REQ_OP_READ: case REQ_OP_WRITE: return sd_setup_read_write_cmnd(cmd); case REQ_OP_ZONE_RESET: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER, false); case REQ_OP_ZONE_RESET_ALL: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER, true); case REQ_OP_ZONE_OPEN: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_OPEN_ZONE, false); case REQ_OP_ZONE_CLOSE: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_CLOSE_ZONE, false); case REQ_OP_ZONE_FINISH: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_FINISH_ZONE, false); default: WARN_ON_ONCE(1); return BLK_STS_NOTSUPP; } } static void sd_uninit_command(struct scsi_cmnd *SCpnt) { struct request *rq = scsi_cmd_to_rq(SCpnt); if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) mempool_free(rq->special_vec.bv_page, sd_page_pool); } static bool sd_need_revalidate(struct gendisk *disk, struct scsi_disk *sdkp) { if (sdkp->device->removable || sdkp->write_prot) { if (disk_check_media_change(disk)) return true; } /* * Force a full rescan after ioctl(BLKRRPART). While the disk state has * nothing to do with partitions, BLKRRPART is used to force a full * revalidate after things like a format for historical reasons. */ return test_bit(GD_NEED_PART_SCAN, &disk->state); } /** * sd_open - open a scsi disk device * @disk: disk to open * @mode: open mode * * Returns 0 if successful. Returns a negated errno value in case * of error. * * Note: This can be called from a user context (e.g. fsck(1) ) * or from within the kernel (e.g. as a result of a mount(1) ). * In the latter case @inode and @filp carry an abridged amount * of information as noted above. * * Locking: called with disk->open_mutex held. **/ static int sd_open(struct gendisk *disk, blk_mode_t mode) { struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdev = sdkp->device; int retval; if (scsi_device_get(sdev)) return -ENXIO; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_open\n")); /* * If the device is in error recovery, wait until it is done. * If the device is offline, then disallow any access to it. */ retval = -ENXIO; if (!scsi_block_when_processing_errors(sdev)) goto error_out; if (sd_need_revalidate(disk, sdkp)) sd_revalidate_disk(disk); /* * If the drive is empty, just let the open fail. */ retval = -ENOMEDIUM; if (sdev->removable && !sdkp->media_present && !(mode & BLK_OPEN_NDELAY)) goto error_out; /* * If the device has the write protect tab set, have the open fail * if the user expects to be able to write to the thing. */ retval = -EROFS; if (sdkp->write_prot && (mode & BLK_OPEN_WRITE)) goto error_out; /* * It is possible that the disk changing stuff resulted in * the device being taken offline. If this is the case, * report this to the user, and don't pretend that the * open actually succeeded. */ retval = -ENXIO; if (!scsi_device_online(sdev)) goto error_out; if ((atomic_inc_return(&sdkp->openers) == 1) && sdev->removable) { if (scsi_block_when_processing_errors(sdev)) scsi_set_medium_removal(sdev, SCSI_REMOVAL_PREVENT); } return 0; error_out: scsi_device_put(sdev); return retval; } /** * sd_release - invoked when the (last) close(2) is called on this * scsi disk. * @disk: disk to release * * Returns 0. * * Note: may block (uninterruptible) if error recovery is underway * on this disk. * * Locking: called with disk->open_mutex held. **/ static void sd_release(struct gendisk *disk) { struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdev = sdkp->device; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_release\n")); if (atomic_dec_return(&sdkp->openers) == 0 && sdev->removable) { if (scsi_block_when_processing_errors(sdev)) scsi_set_medium_removal(sdev, SCSI_REMOVAL_ALLOW); } scsi_device_put(sdev); } static int sd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdp = sdkp->device; struct Scsi_Host *host = sdp->host; sector_t capacity = logical_to_sectors(sdp, sdkp->capacity); int diskinfo[4]; /* default to most commonly used values */ diskinfo[0] = 0x40; /* 1 << 6 */ diskinfo[1] = 0x20; /* 1 << 5 */ diskinfo[2] = capacity >> 11; /* override with calculated, extended default, or driver values */ if (host->hostt->bios_param) host->hostt->bios_param(sdp, disk, capacity, diskinfo); else scsicam_bios_param(disk, capacity, diskinfo); geo->heads = diskinfo[0]; geo->sectors = diskinfo[1]; geo->cylinders = diskinfo[2]; return 0; } /** * sd_ioctl - process an ioctl * @bdev: target block device * @mode: open mode * @cmd: ioctl command number * @arg: this is third argument given to ioctl(2) system call. * Often contains a pointer. * * Returns 0 if successful (some ioctls return positive numbers on * success as well). Returns a negated errno value in case of error. * * Note: most ioctls are forward onto the block subsystem or further * down in the scsi subsystem. **/ static int sd_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct gendisk *disk = bdev->bd_disk; struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdp = sdkp->device; void __user *p = (void __user *)arg; int error; SCSI_LOG_IOCTL(1, sd_printk(KERN_INFO, sdkp, "sd_ioctl: disk=%s, " "cmd=0x%x\n", disk->disk_name, cmd)); if (bdev_is_partition(bdev) && !capable(CAP_SYS_RAWIO)) return -ENOIOCTLCMD; /* * If we are in the middle of error recovery, don't let anyone * else try and use this device. Also, if error recovery fails, it * may try and take the device offline, in which case all further * access to the device is prohibited. */ error = scsi_ioctl_block_when_processing_errors(sdp, cmd, (mode & BLK_OPEN_NDELAY)); if (error) return error; if (is_sed_ioctl(cmd)) return sed_ioctl(sdkp->opal_dev, cmd, p); return scsi_ioctl(sdp, mode & BLK_OPEN_WRITE, cmd, p); } static void set_media_not_present(struct scsi_disk *sdkp) { if (sdkp->media_present) sdkp->device->changed = 1; if (sdkp->device->removable) { sdkp->media_present = 0; sdkp->capacity = 0; } } static int media_not_present(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr) { if (!scsi_sense_valid(sshdr)) return 0; /* not invoked for commands that could return deferred errors */ switch (sshdr->sense_key) { case UNIT_ATTENTION: case NOT_READY: /* medium not present */ if (sshdr->asc == 0x3A) { set_media_not_present(sdkp); return 1; } } return 0; } /** * sd_check_events - check media events * @disk: kernel device descriptor * @clearing: disk events currently being cleared * * Returns mask of DISK_EVENT_*. * * Note: this function is invoked from the block subsystem. **/ static unsigned int sd_check_events(struct gendisk *disk, unsigned int clearing) { struct scsi_disk *sdkp = disk->private_data; struct scsi_device *sdp; int retval; bool disk_changed; if (!sdkp) return 0; sdp = sdkp->device; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_check_events\n")); /* * If the device is offline, don't send any commands - just pretend as * if the command failed. If the device ever comes back online, we * can deal with it then. It is only because of unrecoverable errors * that we would ever take a device offline in the first place. */ if (!scsi_device_online(sdp)) { set_media_not_present(sdkp); goto out; } /* * Using TEST_UNIT_READY enables differentiation between drive with * no cartridge loaded - NOT READY, drive with changed cartridge - * UNIT ATTENTION, or with same cartridge - GOOD STATUS. * * Drives that auto spin down. eg iomega jaz 1G, will be started * by sd_spinup_disk() from sd_revalidate_disk(), which happens whenever * sd_revalidate() is called. */ if (scsi_block_when_processing_errors(sdp)) { struct scsi_sense_hdr sshdr = { 0, }; retval = scsi_test_unit_ready(sdp, SD_TIMEOUT, sdkp->max_retries, &sshdr); /* failed to execute TUR, assume media not present */ if (retval < 0 || host_byte(retval)) { set_media_not_present(sdkp); goto out; } if (media_not_present(sdkp, &sshdr)) goto out; } /* * For removable scsi disk we have to recognise the presence * of a disk in the drive. */ if (!sdkp->media_present) sdp->changed = 1; sdkp->media_present = 1; out: /* * sdp->changed is set under the following conditions: * * Medium present state has changed in either direction. * Device has indicated UNIT_ATTENTION. */ disk_changed = sdp->changed; sdp->changed = 0; return disk_changed ? DISK_EVENT_MEDIA_CHANGE : 0; } static int sd_sync_cache(struct scsi_disk *sdkp) { int res; struct scsi_device *sdp = sdkp->device; const int timeout = sdp->request_queue->rq_timeout * SD_FLUSH_TIMEOUT_MULTIPLIER; /* Leave the rest of the command zero to indicate flush everything. */ const unsigned char cmd[16] = { sdp->use_16_for_sync ? SYNCHRONIZE_CACHE_16 : SYNCHRONIZE_CACHE }; struct scsi_sense_hdr sshdr; struct scsi_failure failure_defs[] = { { .allowed = 3, .result = SCMD_FAILURE_RESULT_ANY, }, {} }; struct scsi_failures failures = { .failure_definitions = failure_defs, }; const struct scsi_exec_args exec_args = { .req_flags = BLK_MQ_REQ_PM, .sshdr = &sshdr, .failures = &failures, }; if (!scsi_device_online(sdp)) return -ENODEV; res = scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, NULL, 0, timeout, sdkp->max_retries, &exec_args); if (res) { sd_print_result(sdkp, "Synchronize Cache(10) failed", res); if (res < 0) return res; if (scsi_status_is_check_condition(res) && scsi_sense_valid(&sshdr)) { sd_print_sense_hdr(sdkp, &sshdr); /* we need to evaluate the error return */ if (sshdr.asc == 0x3a || /* medium not present */ sshdr.asc == 0x20 || /* invalid command */ (sshdr.asc == 0x74 && sshdr.ascq == 0x71)) /* drive is password locked */ /* this is no error here */ return 0; /* * If a format is in progress or if the drive does not * support sync, there is not much we can do because * this is called during shutdown or suspend so just * return success so those operations can proceed. */ if ((sshdr.asc == 0x04 && sshdr.ascq == 0x04) || sshdr.sense_key == ILLEGAL_REQUEST) return 0; } switch (host_byte(res)) { /* ignore errors due to racing a disconnection */ case DID_BAD_TARGET: case DID_NO_CONNECT: return 0; /* signal the upper layer it might try again */ case DID_BUS_BUSY: case DID_IMM_RETRY: case DID_REQUEUE: case DID_SOFT_ERROR: return -EBUSY; default: return -EIO; } } return 0; } static void sd_rescan(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); sd_revalidate_disk(sdkp->disk); } static int sd_get_unique_id(struct gendisk *disk, u8 id[16], enum blk_unique_id type) { struct scsi_device *sdev = scsi_disk(disk)->device; const struct scsi_vpd *vpd; const unsigned char *d; int ret = -ENXIO, len; rcu_read_lock(); vpd = rcu_dereference(sdev->vpd_pg83); if (!vpd) goto out_unlock; ret = -EINVAL; for (d = vpd->data + 4; d < vpd->data + vpd->len; d += d[3] + 4) { /* we only care about designators with LU association */ if (((d[1] >> 4) & 0x3) != 0x00) continue; if ((d[1] & 0xf) != type) continue; /* * Only exit early if a 16-byte descriptor was found. Otherwise * keep looking as one with more entropy might still show up. */ len = d[3]; if (len != 8 && len != 12 && len != 16) continue; ret = len; memcpy(id, d + 4, len); if (len == 16) break; } out_unlock: rcu_read_unlock(); return ret; } static int sd_scsi_to_pr_err(struct scsi_sense_hdr *sshdr, int result) { switch (host_byte(result)) { case DID_TRANSPORT_MARGINAL: case DID_TRANSPORT_DISRUPTED: case DID_BUS_BUSY: return PR_STS_RETRY_PATH_FAILURE; case DID_NO_CONNECT: return PR_STS_PATH_FAILED; case DID_TRANSPORT_FAILFAST: return PR_STS_PATH_FAST_FAILED; } switch (status_byte(result)) { case SAM_STAT_RESERVATION_CONFLICT: return PR_STS_RESERVATION_CONFLICT; case SAM_STAT_CHECK_CONDITION: if (!scsi_sense_valid(sshdr)) return PR_STS_IOERR; if (sshdr->sense_key == ILLEGAL_REQUEST && (sshdr->asc == 0x26 || sshdr->asc == 0x24)) return -EINVAL; fallthrough; default: return PR_STS_IOERR; } } static int sd_pr_in_command(struct block_device *bdev, u8 sa, unsigned char *data, int data_len) { struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk); struct scsi_device *sdev = sdkp->device; struct scsi_sense_hdr sshdr; u8 cmd[10] = { PERSISTENT_RESERVE_IN, sa }; struct scsi_failure failure_defs[] = { { .sense = UNIT_ATTENTION, .asc = SCMD_FAILURE_ASC_ANY, .ascq = SCMD_FAILURE_ASCQ_ANY, .allowed = 5, .result = SAM_STAT_CHECK_CONDITION, }, {} }; struct scsi_failures failures = { .failure_definitions = failure_defs, }; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, .failures = &failures, }; int result; put_unaligned_be16(data_len, &cmd[7]); result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_IN, data, data_len, SD_TIMEOUT, sdkp->max_retries, &exec_args); if (scsi_status_is_check_condition(result) && scsi_sense_valid(&sshdr)) { sdev_printk(KERN_INFO, sdev, "PR command failed: %d\n", result); scsi_print_sense_hdr(sdev, NULL, &sshdr); } if (result <= 0) return result; return sd_scsi_to_pr_err(&sshdr, result); } static int sd_pr_read_keys(struct block_device *bdev, struct pr_keys *keys_info) { int result, i, data_offset, num_copy_keys; u32 num_keys = keys_info->num_keys; int data_len = num_keys * 8 + 8; u8 *data; data = kzalloc(data_len, GFP_KERNEL); if (!data) return -ENOMEM; result = sd_pr_in_command(bdev, READ_KEYS, data, data_len); if (result) goto free_data; keys_info->generation = get_unaligned_be32(&data[0]); keys_info->num_keys = get_unaligned_be32(&data[4]) / 8; data_offset = 8; num_copy_keys = min(num_keys, keys_info->num_keys); for (i = 0; i < num_copy_keys; i++) { keys_info->keys[i] = get_unaligned_be64(&data[data_offset]); data_offset += 8; } free_data: kfree(data); return result; } static int sd_pr_read_reservation(struct block_device *bdev, struct pr_held_reservation *rsv) { struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk); struct scsi_device *sdev = sdkp->device; u8 data[24] = { }; int result, len; result = sd_pr_in_command(bdev, READ_RESERVATION, data, sizeof(data)); if (result) return result; len = get_unaligned_be32(&data[4]); if (!len) return 0; /* Make sure we have at least the key and type */ if (len < 14) { sdev_printk(KERN_INFO, sdev, "READ RESERVATION failed due to short return buffer of %d bytes\n", len); return -EINVAL; } rsv->generation = get_unaligned_be32(&data[0]); rsv->key = get_unaligned_be64(&data[8]); rsv->type = scsi_pr_type_to_block(data[21] & 0x0f); return 0; } static int sd_pr_out_command(struct block_device *bdev, u8 sa, u64 key, u64 sa_key, enum scsi_pr_type type, u8 flags) { struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk); struct scsi_device *sdev = sdkp->device; struct scsi_sense_hdr sshdr; struct scsi_failure failure_defs[] = { { .sense = UNIT_ATTENTION, .asc = SCMD_FAILURE_ASC_ANY, .ascq = SCMD_FAILURE_ASCQ_ANY, .allowed = 5, .result = SAM_STAT_CHECK_CONDITION, }, {} }; struct scsi_failures failures = { .failure_definitions = failure_defs, }; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, .failures = &failures, }; int result; u8 cmd[16] = { 0, }; u8 data[24] = { 0, }; cmd[0] = PERSISTENT_RESERVE_OUT; cmd[1] = sa; cmd[2] = type; put_unaligned_be32(sizeof(data), &cmd[5]); put_unaligned_be64(key, &data[0]); put_unaligned_be64(sa_key, &data[8]); data[20] = flags; result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_OUT, &data, sizeof(data), SD_TIMEOUT, sdkp->max_retries, &exec_args); if (scsi_status_is_check_condition(result) && scsi_sense_valid(&sshdr)) { sdev_printk(KERN_INFO, sdev, "PR command failed: %d\n", result); scsi_print_sense_hdr(sdev, NULL, &sshdr); } if (result <= 0) return result; return sd_scsi_to_pr_err(&sshdr, result); } static int sd_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, u32 flags) { if (flags & ~PR_FL_IGNORE_KEY) return -EOPNOTSUPP; return sd_pr_out_command(bdev, (flags & PR_FL_IGNORE_KEY) ? 0x06 : 0x00, old_key, new_key, 0, (1 << 0) /* APTPL */); } static int sd_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, u32 flags) { if (flags) return -EOPNOTSUPP; return sd_pr_out_command(bdev, 0x01, key, 0, block_pr_type_to_scsi(type), 0); } static int sd_pr_release(struct block_device *bdev, u64 key, enum pr_type type) { return sd_pr_out_command(bdev, 0x02, key, 0, block_pr_type_to_scsi(type), 0); } static int sd_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, enum pr_type type, bool abort) { return sd_pr_out_command(bdev, abort ? 0x05 : 0x04, old_key, new_key, block_pr_type_to_scsi(type), 0); } static int sd_pr_clear(struct block_device *bdev, u64 key) { return sd_pr_out_command(bdev, 0x03, key, 0, 0, 0); } static const struct pr_ops sd_pr_ops = { .pr_register = sd_pr_register, .pr_reserve = sd_pr_reserve, .pr_release = sd_pr_release, .pr_preempt = sd_pr_preempt, .pr_clear = sd_pr_clear, .pr_read_keys = sd_pr_read_keys, .pr_read_reservation = sd_pr_read_reservation, }; static void scsi_disk_free_disk(struct gendisk *disk) { struct scsi_disk *sdkp = scsi_disk(disk); put_device(&sdkp->disk_dev); } static const struct block_device_operations sd_fops = { .owner = THIS_MODULE, .open = sd_open, .release = sd_release, .ioctl = sd_ioctl, .getgeo = sd_getgeo, .compat_ioctl = blkdev_compat_ptr_ioctl, .check_events = sd_check_events, .unlock_native_capacity = sd_unlock_native_capacity, .report_zones = sd_zbc_report_zones, .get_unique_id = sd_get_unique_id, .free_disk = scsi_disk_free_disk, .pr_ops = &sd_pr_ops, }; /** * sd_eh_reset - reset error handling callback * @scmd: sd-issued command that has failed * * This function is called by the SCSI midlayer before starting * SCSI EH. When counting medium access failures we have to be * careful to register it only only once per device and SCSI EH run; * there might be several timed out commands which will cause the * 'max_medium_access_timeouts' counter to trigger after the first * SCSI EH run already and set the device to offline. * So this function resets the internal counter before starting SCSI EH. **/ static void sd_eh_reset(struct scsi_cmnd *scmd) { struct scsi_disk *sdkp = scsi_disk(scsi_cmd_to_rq(scmd)->q->disk); /* New SCSI EH run, reset gate variable */ sdkp->ignore_medium_access_errors = false; } /** * sd_eh_action - error handling callback * @scmd: sd-issued command that has failed * @eh_disp: The recovery disposition suggested by the midlayer * * This function is called by the SCSI midlayer upon completion of an * error test command (currently TEST UNIT READY). The result of sending * the eh command is passed in eh_disp. We're looking for devices that * fail medium access commands but are OK with non access commands like * test unit ready (so wrongly see the device as having a successful * recovery) **/ static int sd_eh_action(struct scsi_cmnd *scmd, int eh_disp) { struct scsi_disk *sdkp = scsi_disk(scsi_cmd_to_rq(scmd)->q->disk); struct scsi_device *sdev = scmd->device; if (!scsi_device_online(sdev) || !scsi_medium_access_command(scmd) || host_byte(scmd->result) != DID_TIME_OUT || eh_disp != SUCCESS) return eh_disp; /* * The device has timed out executing a medium access command. * However, the TEST UNIT READY command sent during error * handling completed successfully. Either the device is in the * process of recovering or has it suffered an internal failure * that prevents access to the storage medium. */ if (!sdkp->ignore_medium_access_errors) { sdkp->medium_access_timed_out++; sdkp->ignore_medium_access_errors = true; } /* * If the device keeps failing read/write commands but TEST UNIT * READY always completes successfully we assume that medium * access is no longer possible and take the device offline. */ if (sdkp->medium_access_timed_out >= sdkp->max_medium_access_timeouts) { scmd_printk(KERN_ERR, scmd, "Medium access timeout failure. Offlining disk!\n"); mutex_lock(&sdev->state_mutex); scsi_device_set_state(sdev, SDEV_OFFLINE); mutex_unlock(&sdev->state_mutex); return SUCCESS; } return eh_disp; } static unsigned int sd_completed_bytes(struct scsi_cmnd *scmd) { struct request *req = scsi_cmd_to_rq(scmd); struct scsi_device *sdev = scmd->device; unsigned int transferred, good_bytes; u64 start_lba, end_lba, bad_lba; /* * Some commands have a payload smaller than the device logical * block size (e.g. INQUIRY on a 4K disk). */ if (scsi_bufflen(scmd) <= sdev->sector_size) return 0; /* Check if we have a 'bad_lba' information */ if (!scsi_get_sense_info_fld(scmd->sense_buffer, SCSI_SENSE_BUFFERSIZE, &bad_lba)) return 0; /* * If the bad lba was reported incorrectly, we have no idea where * the error is. */ start_lba = sectors_to_logical(sdev, blk_rq_pos(req)); end_lba = start_lba + bytes_to_logical(sdev, scsi_bufflen(scmd)); if (bad_lba < start_lba || bad_lba >= end_lba) return 0; /* * resid is optional but mostly filled in. When it's unused, * its value is zero, so we assume the whole buffer transferred */ transferred = scsi_bufflen(scmd) - scsi_get_resid(scmd); /* This computation should always be done in terms of the * resolution of the device's medium. */ good_bytes = logical_to_bytes(sdev, bad_lba - start_lba); return min(good_bytes, transferred); } /** * sd_done - bottom half handler: called when the lower level * driver has completed (successfully or otherwise) a scsi command. * @SCpnt: mid-level's per command structure. * * Note: potentially run from within an ISR. Must not block. **/ static int sd_done(struct scsi_cmnd *SCpnt) { int result = SCpnt->result; unsigned int good_bytes = result ? 0 : scsi_bufflen(SCpnt); unsigned int sector_size = SCpnt->device->sector_size; unsigned int resid; struct scsi_sense_hdr sshdr; struct request *req = scsi_cmd_to_rq(SCpnt); struct scsi_disk *sdkp = scsi_disk(req->q->disk); int sense_valid = 0; int sense_deferred = 0; switch (req_op(req)) { case REQ_OP_DISCARD: case REQ_OP_WRITE_ZEROES: case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_RESET_ALL: case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: case REQ_OP_ZONE_FINISH: if (!result) { good_bytes = blk_rq_bytes(req); scsi_set_resid(SCpnt, 0); } else { good_bytes = 0; scsi_set_resid(SCpnt, blk_rq_bytes(req)); } break; default: /* * In case of bogus fw or device, we could end up having * an unaligned partial completion. Check this here and force * alignment. */ resid = scsi_get_resid(SCpnt); if (resid & (sector_size - 1)) { sd_printk(KERN_INFO, sdkp, "Unaligned partial completion (resid=%u, sector_sz=%u)\n", resid, sector_size); scsi_print_command(SCpnt); resid = min(scsi_bufflen(SCpnt), round_up(resid, sector_size)); scsi_set_resid(SCpnt, resid); } } if (result) { sense_valid = scsi_command_normalize_sense(SCpnt, &sshdr); if (sense_valid) sense_deferred = scsi_sense_is_deferred(&sshdr); } sdkp->medium_access_timed_out = 0; if (!scsi_status_is_check_condition(result) && (!sense_valid || sense_deferred)) goto out; switch (sshdr.sense_key) { case HARDWARE_ERROR: case MEDIUM_ERROR: good_bytes = sd_completed_bytes(SCpnt); break; case RECOVERED_ERROR: good_bytes = scsi_bufflen(SCpnt); break; case NO_SENSE: /* This indicates a false check condition, so ignore it. An * unknown amount of data was transferred so treat it as an * error. */ SCpnt->result = 0; memset(SCpnt->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); break; case ABORTED_COMMAND: if (sshdr.asc == 0x10) /* DIF: Target detected corruption */ good_bytes = sd_completed_bytes(SCpnt); break; case ILLEGAL_REQUEST: switch (sshdr.asc) { case 0x10: /* DIX: Host detected corruption */ good_bytes = sd_completed_bytes(SCpnt); break; case 0x20: /* INVALID COMMAND OPCODE */ case 0x24: /* INVALID FIELD IN CDB */ switch (SCpnt->cmnd[0]) { case UNMAP: sd_disable_discard(sdkp); break; case WRITE_SAME_16: case WRITE_SAME: if (SCpnt->cmnd[1] & 8) { /* UNMAP */ sd_disable_discard(sdkp); } else { sd_disable_write_same(sdkp); req->rq_flags |= RQF_QUIET; } break; } } break; default: break; } out: if (sdkp->device->type == TYPE_ZBC) good_bytes = sd_zbc_complete(SCpnt, good_bytes, &sshdr); SCSI_LOG_HLCOMPLETE(1, scmd_printk(KERN_INFO, SCpnt, "sd_done: completed %d of %d bytes\n", good_bytes, scsi_bufflen(SCpnt))); return good_bytes; } /* * spinup disk - called only in sd_revalidate_disk() */ static void sd_spinup_disk(struct scsi_disk *sdkp) { static const u8 cmd[10] = { TEST_UNIT_READY }; unsigned long spintime_expire = 0; int spintime, sense_valid = 0; unsigned int the_result; struct scsi_sense_hdr sshdr; struct scsi_failure failure_defs[] = { /* Do not retry Medium Not Present */ { .sense = UNIT_ATTENTION, .asc = 0x3A, .ascq = SCMD_FAILURE_ASCQ_ANY, .result = SAM_STAT_CHECK_CONDITION, }, { .sense = NOT_READY, .asc = 0x3A, .ascq = SCMD_FAILURE_ASCQ_ANY, .result = SAM_STAT_CHECK_CONDITION, }, /* Retry when scsi_status_is_good would return false 3 times */ { .result = SCMD_FAILURE_STAT_ANY, .allowed = 3, }, {} }; struct scsi_failures failures = { .failure_definitions = failure_defs, }; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, .failures = &failures, }; spintime = 0; /* Spin up drives, as required. Only do this at boot time */ /* Spinup needs to be done for module loads too. */ do { bool media_was_present = sdkp->media_present; scsi_failures_reset_retries(&failures); the_result = scsi_execute_cmd(sdkp->device, cmd, REQ_OP_DRV_IN, NULL, 0, SD_TIMEOUT, sdkp->max_retries, &exec_args); if (the_result > 0) { /* * If the drive has indicated to us that it doesn't * have any media in it, don't bother with any more * polling. */ if (media_not_present(sdkp, &sshdr)) { if (media_was_present) sd_printk(KERN_NOTICE, sdkp, "Media removed, stopped polling\n"); return; } sense_valid = scsi_sense_valid(&sshdr); } if (!scsi_status_is_check_condition(the_result)) { /* no sense, TUR either succeeded or failed * with a status error */ if(!spintime && !scsi_status_is_good(the_result)) { sd_print_result(sdkp, "Test Unit Ready failed", the_result); } break; } /* * The device does not want the automatic start to be issued. */ if (sdkp->device->no_start_on_add) break; if (sense_valid && sshdr.sense_key == NOT_READY) { if (sshdr.asc == 4 && sshdr.ascq == 3) break; /* manual intervention required */ if (sshdr.asc == 4 && sshdr.ascq == 0xb) break; /* standby */ if (sshdr.asc == 4 && sshdr.ascq == 0xc) break; /* unavailable */ if (sshdr.asc == 4 && sshdr.ascq == 0x1b) break; /* sanitize in progress */ if (sshdr.asc == 4 && sshdr.ascq == 0x24) break; /* depopulation in progress */ if (sshdr.asc == 4 && sshdr.ascq == 0x25) break; /* depopulation restoration in progress */ /* * Issue command to spin up drive when not ready */ if (!spintime) { /* Return immediately and start spin cycle */ const u8 start_cmd[10] = { [0] = START_STOP, [1] = 1, [4] = sdkp->device->start_stop_pwr_cond ? 0x11 : 1, }; sd_printk(KERN_NOTICE, sdkp, "Spinning up disk..."); scsi_execute_cmd(sdkp->device, start_cmd, REQ_OP_DRV_IN, NULL, 0, SD_TIMEOUT, sdkp->max_retries, &exec_args); spintime_expire = jiffies + 100 * HZ; spintime = 1; } /* Wait 1 second for next try */ msleep(1000); printk(KERN_CONT "."); /* * Wait for USB flash devices with slow firmware. * Yes, this sense key/ASC combination shouldn't * occur here. It's characteristic of these devices. */ } else if (sense_valid && sshdr.sense_key == UNIT_ATTENTION && sshdr.asc == 0x28) { if (!spintime) { spintime_expire = jiffies + 5 * HZ; spintime = 1; } /* Wait 1 second for next try */ msleep(1000); } else { /* we don't understand the sense code, so it's * probably pointless to loop */ if(!spintime) { sd_printk(KERN_NOTICE, sdkp, "Unit Not Ready\n"); sd_print_sense_hdr(sdkp, &sshdr); } break; } } while (spintime && time_before_eq(jiffies, spintime_expire)); if (spintime) { if (scsi_status_is_good(the_result)) printk(KERN_CONT "ready\n"); else printk(KERN_CONT "not responding...\n"); } } /* * Determine whether disk supports Data Integrity Field. */ static int sd_read_protection_type(struct scsi_disk *sdkp, unsigned char *buffer) { struct scsi_device *sdp = sdkp->device; u8 type; if (scsi_device_protection(sdp) == 0 || (buffer[12] & 1) == 0) { sdkp->protection_type = 0; return 0; } type = ((buffer[12] >> 1) & 7) + 1; /* P_TYPE 0 = Type 1 */ if (type > T10_PI_TYPE3_PROTECTION) { sd_printk(KERN_ERR, sdkp, "formatted with unsupported" \ " protection type %u. Disabling disk!\n", type); sdkp->protection_type = 0; return -ENODEV; } sdkp->protection_type = type; return 0; } static void sd_config_protection(struct scsi_disk *sdkp, struct queue_limits *lim) { struct scsi_device *sdp = sdkp->device; if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) sd_dif_config_host(sdkp, lim); if (!sdkp->protection_type) return; if (!scsi_host_dif_capable(sdp->host, sdkp->protection_type)) { sd_first_printk(KERN_NOTICE, sdkp, "Disabling DIF Type %u protection\n", sdkp->protection_type); sdkp->protection_type = 0; } sd_first_printk(KERN_NOTICE, sdkp, "Enabling DIF Type %u protection\n", sdkp->protection_type); } static void read_capacity_error(struct scsi_disk *sdkp, struct scsi_device *sdp, struct scsi_sense_hdr *sshdr, int sense_valid, int the_result) { if (sense_valid) sd_print_sense_hdr(sdkp, sshdr); else sd_printk(KERN_NOTICE, sdkp, "Sense not available.\n"); /* * Set dirty bit for removable devices if not ready - * sometimes drives will not report this properly. */ if (sdp->removable && sense_valid && sshdr->sense_key == NOT_READY) set_media_not_present(sdkp); /* * We used to set media_present to 0 here to indicate no media * in the drive, but some drives fail read capacity even with * media present, so we can't do that. */ sdkp->capacity = 0; /* unknown mapped to zero - as usual */ } #define RC16_LEN 32 #if RC16_LEN > SD_BUF_SIZE #error RC16_LEN must not be more than SD_BUF_SIZE #endif #define READ_CAPACITY_RETRIES_ON_RESET 10 static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp, struct queue_limits *lim, unsigned char *buffer) { unsigned char cmd[16]; struct scsi_sense_hdr sshdr; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, }; int sense_valid = 0; int the_result; int retries = 3, reset_retries = READ_CAPACITY_RETRIES_ON_RESET; unsigned int alignment; unsigned long long lba; unsigned sector_size; if (sdp->no_read_capacity_16) return -EINVAL; do { memset(cmd, 0, 16); cmd[0] = SERVICE_ACTION_IN_16; cmd[1] = SAI_READ_CAPACITY_16; cmd[13] = RC16_LEN; memset(buffer, 0, RC16_LEN); the_result = scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, buffer, RC16_LEN, SD_TIMEOUT, sdkp->max_retries, &exec_args); if (the_result > 0) { if (media_not_present(sdkp, &sshdr)) return -ENODEV; sense_valid = scsi_sense_valid(&sshdr); if (sense_valid && sshdr.sense_key == ILLEGAL_REQUEST && (sshdr.asc == 0x20 || sshdr.asc == 0x24) && sshdr.ascq == 0x00) /* Invalid Command Operation Code or * Invalid Field in CDB, just retry * silently with RC10 */ return -EINVAL; if (sense_valid && sshdr.sense_key == UNIT_ATTENTION && sshdr.asc == 0x29 && sshdr.ascq == 0x00) /* Device reset might occur several times, * give it one more chance */ if (--reset_retries > 0) continue; } retries--; } while (the_result && retries); if (the_result) { sd_print_result(sdkp, "Read Capacity(16) failed", the_result); read_capacity_error(sdkp, sdp, &sshdr, sense_valid, the_result); return -EINVAL; } sector_size = get_unaligned_be32(&buffer[8]); lba = get_unaligned_be64(&buffer[0]); if (sd_read_protection_type(sdkp, buffer) < 0) { sdkp->capacity = 0; return -ENODEV; } /* Logical blocks per physical block exponent */ sdkp->physical_block_size = (1 << (buffer[13] & 0xf)) * sector_size; /* RC basis */ sdkp->rc_basis = (buffer[12] >> 4) & 0x3; /* Lowest aligned logical block */ alignment = ((buffer[14] & 0x3f) << 8 | buffer[15]) * sector_size; lim->alignment_offset = alignment; if (alignment && sdkp->first_scan) sd_printk(KERN_NOTICE, sdkp, "physical block alignment offset: %u\n", alignment); if (buffer[14] & 0x80) { /* LBPME */ sdkp->lbpme = 1; if (buffer[14] & 0x40) /* LBPRZ */ sdkp->lbprz = 1; } sdkp->capacity = lba + 1; return sector_size; } static int read_capacity_10(struct scsi_disk *sdkp, struct scsi_device *sdp, unsigned char *buffer) { static const u8 cmd[10] = { READ_CAPACITY }; struct scsi_sense_hdr sshdr; struct scsi_failure failure_defs[] = { /* Do not retry Medium Not Present */ { .sense = UNIT_ATTENTION, .asc = 0x3A, .result = SAM_STAT_CHECK_CONDITION, }, { .sense = NOT_READY, .asc = 0x3A, .result = SAM_STAT_CHECK_CONDITION, }, /* Device reset might occur several times so retry a lot */ { .sense = UNIT_ATTENTION, .asc = 0x29, .allowed = READ_CAPACITY_RETRIES_ON_RESET, .result = SAM_STAT_CHECK_CONDITION, }, /* Any other error not listed above retry 3 times */ { .result = SCMD_FAILURE_RESULT_ANY, .allowed = 3, }, {} }; struct scsi_failures failures = { .failure_definitions = failure_defs, }; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, .failures = &failures, }; int sense_valid = 0; int the_result; sector_t lba; unsigned sector_size; memset(buffer, 0, 8); the_result = scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, buffer, 8, SD_TIMEOUT, sdkp->max_retries, &exec_args); if (the_result > 0) { sense_valid = scsi_sense_valid(&sshdr); if (media_not_present(sdkp, &sshdr)) return -ENODEV; } if (the_result) { sd_print_result(sdkp, "Read Capacity(10) failed", the_result); read_capacity_error(sdkp, sdp, &sshdr, sense_valid, the_result); return -EINVAL; } sector_size = get_unaligned_be32(&buffer[4]); lba = get_unaligned_be32(&buffer[0]); if (sdp->no_read_capacity_16 && (lba == 0xffffffff)) { /* Some buggy (usb cardreader) devices return an lba of 0xffffffff when the want to report a size of 0 (with which they really mean no media is present) */ sdkp->capacity = 0; sdkp->physical_block_size = sector_size; return sector_size; } sdkp->capacity = lba + 1; sdkp->physical_block_size = sector_size; return sector_size; } static int sd_try_rc16_first(struct scsi_device *sdp) { if (sdp->host->max_cmd_len < 16) return 0; if (sdp->try_rc_10_first) return 0; if (sdp->scsi_level > SCSI_SPC_2) return 1; if (scsi_device_protection(sdp)) return 1; return 0; } /* * read disk capacity */ static void sd_read_capacity(struct scsi_disk *sdkp, struct queue_limits *lim, unsigned char *buffer) { int sector_size; struct scsi_device *sdp = sdkp->device; if (sd_try_rc16_first(sdp)) { sector_size = read_capacity_16(sdkp, sdp, lim, buffer); if (sector_size == -EOVERFLOW) goto got_data; if (sector_size == -ENODEV) return; if (sector_size < 0) sector_size = read_capacity_10(sdkp, sdp, buffer); if (sector_size < 0) return; } else { sector_size = read_capacity_10(sdkp, sdp, buffer); if (sector_size == -EOVERFLOW) goto got_data; if (sector_size < 0) return; if ((sizeof(sdkp->capacity) > 4) && (sdkp->capacity > 0xffffffffULL)) { int old_sector_size = sector_size; sd_printk(KERN_NOTICE, sdkp, "Very big device. " "Trying to use READ CAPACITY(16).\n"); sector_size = read_capacity_16(sdkp, sdp, lim, buffer); if (sector_size < 0) { sd_printk(KERN_NOTICE, sdkp, "Using 0xffffffff as device size\n"); sdkp->capacity = 1 + (sector_t) 0xffffffff; sector_size = old_sector_size; goto got_data; } /* Remember that READ CAPACITY(16) succeeded */ sdp->try_rc_10_first = 0; } } /* Some devices are known to return the total number of blocks, * not the highest block number. Some devices have versions * which do this and others which do not. Some devices we might * suspect of doing this but we don't know for certain. * * If we know the reported capacity is wrong, decrement it. If * we can only guess, then assume the number of blocks is even * (usually true but not always) and err on the side of lowering * the capacity. */ if (sdp->fix_capacity || (sdp->guess_capacity && (sdkp->capacity & 0x01))) { sd_printk(KERN_INFO, sdkp, "Adjusting the sector count " "from its reported value: %llu\n", (unsigned long long) sdkp->capacity); --sdkp->capacity; } got_data: if (sector_size == 0) { sector_size = 512; sd_printk(KERN_NOTICE, sdkp, "Sector size 0 reported, " "assuming 512.\n"); } if (sector_size != 512 && sector_size != 1024 && sector_size != 2048 && sector_size != 4096) { sd_printk(KERN_NOTICE, sdkp, "Unsupported sector size %d.\n", sector_size); /* * The user might want to re-format the drive with * a supported sectorsize. Once this happens, it * would be relatively trivial to set the thing up. * For this reason, we leave the thing in the table. */ sdkp->capacity = 0; /* * set a bogus sector size so the normal read/write * logic in the block layer will eventually refuse any * request on this device without tripping over power * of two sector size assumptions */ sector_size = 512; } lim->logical_block_size = sector_size; lim->physical_block_size = sdkp->physical_block_size; sdkp->device->sector_size = sector_size; if (sdkp->capacity > 0xffffffff) sdp->use_16_for_rw = 1; } /* * Print disk capacity */ static void sd_print_capacity(struct scsi_disk *sdkp, sector_t old_capacity) { int sector_size = sdkp->device->sector_size; char cap_str_2[10], cap_str_10[10]; if (!sdkp->first_scan && old_capacity == sdkp->capacity) return; string_get_size(sdkp->capacity, sector_size, STRING_UNITS_2, cap_str_2, sizeof(cap_str_2)); string_get_size(sdkp->capacity, sector_size, STRING_UNITS_10, cap_str_10, sizeof(cap_str_10)); sd_printk(KERN_NOTICE, sdkp, "%llu %d-byte logical blocks: (%s/%s)\n", (unsigned long long)sdkp->capacity, sector_size, cap_str_10, cap_str_2); if (sdkp->physical_block_size != sector_size) sd_printk(KERN_NOTICE, sdkp, "%u-byte physical blocks\n", sdkp->physical_block_size); } /* called with buffer of length 512 */ static inline int sd_do_mode_sense(struct scsi_disk *sdkp, int dbd, int modepage, unsigned char *buffer, int len, struct scsi_mode_data *data, struct scsi_sense_hdr *sshdr) { /* * If we must use MODE SENSE(10), make sure that the buffer length * is at least 8 bytes so that the mode sense header fits. */ if (sdkp->device->use_10_for_ms && len < 8) len = 8; return scsi_mode_sense(sdkp->device, dbd, modepage, 0, buffer, len, SD_TIMEOUT, sdkp->max_retries, data, sshdr); } /* * read write protect setting, if possible - called only in sd_revalidate_disk() * called with buffer of length SD_BUF_SIZE */ static void sd_read_write_protect_flag(struct scsi_disk *sdkp, unsigned char *buffer) { int res; struct scsi_device *sdp = sdkp->device; struct scsi_mode_data data; int old_wp = sdkp->write_prot; set_disk_ro(sdkp->disk, 0); if (sdp->skip_ms_page_3f) { sd_first_printk(KERN_NOTICE, sdkp, "Assuming Write Enabled\n"); return; } if (sdp->use_192_bytes_for_3f) { res = sd_do_mode_sense(sdkp, 0, 0x3F, buffer, 192, &data, NULL); } else { /* * First attempt: ask for all pages (0x3F), but only 4 bytes. * We have to start carefully: some devices hang if we ask * for more than is available. */ res = sd_do_mode_sense(sdkp, 0, 0x3F, buffer, 4, &data, NULL); /* * Second attempt: ask for page 0 When only page 0 is * implemented, a request for page 3F may return Sense Key * 5: Illegal Request, Sense Code 24: Invalid field in * CDB. */ if (res < 0) res = sd_do_mode_sense(sdkp, 0, 0, buffer, 4, &data, NULL); /* * Third attempt: ask 255 bytes, as we did earlier. */ if (res < 0) res = sd_do_mode_sense(sdkp, 0, 0x3F, buffer, 255, &data, NULL); } if (res < 0) { sd_first_printk(KERN_WARNING, sdkp, "Test WP failed, assume Write Enabled\n"); } else { sdkp->write_prot = ((data.device_specific & 0x80) != 0); set_disk_ro(sdkp->disk, sdkp->write_prot); if (sdkp->first_scan || old_wp != sdkp->write_prot) { sd_printk(KERN_NOTICE, sdkp, "Write Protect is %s\n", sdkp->write_prot ? "on" : "off"); sd_printk(KERN_DEBUG, sdkp, "Mode Sense: %4ph\n", buffer); } } } /* * sd_read_cache_type - called only from sd_revalidate_disk() * called with buffer of length SD_BUF_SIZE */ static void sd_read_cache_type(struct scsi_disk *sdkp, unsigned char *buffer) { int len = 0, res; struct scsi_device *sdp = sdkp->device; int dbd; int modepage; int first_len; struct scsi_mode_data data; struct scsi_sense_hdr sshdr; int old_wce = sdkp->WCE; int old_rcd = sdkp->RCD; int old_dpofua = sdkp->DPOFUA; if (sdkp->cache_override) return; first_len = 4; if (sdp->skip_ms_page_8) { if (sdp->type == TYPE_RBC) goto defaults; else { if (sdp->skip_ms_page_3f) goto defaults; modepage = 0x3F; if (sdp->use_192_bytes_for_3f) first_len = 192; dbd = 0; } } else if (sdp->type == TYPE_RBC) { modepage = 6; dbd = 8; } else { modepage = 8; dbd = 0; } /* cautiously ask */ res = sd_do_mode_sense(sdkp, dbd, modepage, buffer, first_len, &data, &sshdr); if (res < 0) goto bad_sense; if (!data.header_length) { modepage = 6; first_len = 0; sd_first_printk(KERN_ERR, sdkp, "Missing header in MODE_SENSE response\n"); } /* that went OK, now ask for the proper length */ len = data.length; /* * We're only interested in the first three bytes, actually. * But the data cache page is defined for the first 20. */ if (len < 3) goto bad_sense; else if (len > SD_BUF_SIZE) { sd_first_printk(KERN_NOTICE, sdkp, "Truncating mode parameter " "data from %d to %d bytes\n", len, SD_BUF_SIZE); len = SD_BUF_SIZE; } if (modepage == 0x3F && sdp->use_192_bytes_for_3f) len = 192; /* Get the data */ if (len > first_len) res = sd_do_mode_sense(sdkp, dbd, modepage, buffer, len, &data, &sshdr); if (!res) { int offset = data.header_length + data.block_descriptor_length; while (offset < len) { u8 page_code = buffer[offset] & 0x3F; u8 spf = buffer[offset] & 0x40; if (page_code == 8 || page_code == 6) { /* We're interested only in the first 3 bytes. */ if (len - offset <= 2) { sd_first_printk(KERN_ERR, sdkp, "Incomplete mode parameter " "data\n"); goto defaults; } else { modepage = page_code; goto Page_found; } } else { /* Go to the next page */ if (spf && len - offset > 3) offset += 4 + (buffer[offset+2] << 8) + buffer[offset+3]; else if (!spf && len - offset > 1) offset += 2 + buffer[offset+1]; else { sd_first_printk(KERN_ERR, sdkp, "Incomplete mode " "parameter data\n"); goto defaults; } } } sd_first_printk(KERN_WARNING, sdkp, "No Caching mode page found\n"); goto defaults; Page_found: if (modepage == 8) { sdkp->WCE = ((buffer[offset + 2] & 0x04) != 0); sdkp->RCD = ((buffer[offset + 2] & 0x01) != 0); } else { sdkp->WCE = ((buffer[offset + 2] & 0x01) == 0); sdkp->RCD = 0; } sdkp->DPOFUA = (data.device_specific & 0x10) != 0; if (sdp->broken_fua) { sd_first_printk(KERN_NOTICE, sdkp, "Disabling FUA\n"); sdkp->DPOFUA = 0; } else if (sdkp->DPOFUA && !sdkp->device->use_10_for_rw && !sdkp->device->use_16_for_rw) { sd_first_printk(KERN_NOTICE, sdkp, "Uses READ/WRITE(6), disabling FUA\n"); sdkp->DPOFUA = 0; } /* No cache flush allowed for write protected devices */ if (sdkp->WCE && sdkp->write_prot) sdkp->WCE = 0; if (sdkp->first_scan || old_wce != sdkp->WCE || old_rcd != sdkp->RCD || old_dpofua != sdkp->DPOFUA) sd_printk(KERN_NOTICE, sdkp, "Write cache: %s, read cache: %s, %s\n", sdkp->WCE ? "enabled" : "disabled", sdkp->RCD ? "disabled" : "enabled", sdkp->DPOFUA ? "supports DPO and FUA" : "doesn't support DPO or FUA"); return; } bad_sense: if (res == -EIO && scsi_sense_valid(&sshdr) && sshdr.sense_key == ILLEGAL_REQUEST && sshdr.asc == 0x24 && sshdr.ascq == 0x0) /* Invalid field in CDB */ sd_first_printk(KERN_NOTICE, sdkp, "Cache data unavailable\n"); else sd_first_printk(KERN_ERR, sdkp, "Asking for cache data failed\n"); defaults: if (sdp->wce_default_on) { sd_first_printk(KERN_NOTICE, sdkp, "Assuming drive cache: write back\n"); sdkp->WCE = 1; } else { sd_first_printk(KERN_WARNING, sdkp, "Assuming drive cache: write through\n"); sdkp->WCE = 0; } sdkp->RCD = 0; sdkp->DPOFUA = 0; } static bool sd_is_perm_stream(struct scsi_disk *sdkp, unsigned int stream_id) { u8 cdb[16] = { SERVICE_ACTION_IN_16, SAI_GET_STREAM_STATUS }; struct { struct scsi_stream_status_header h; struct scsi_stream_status s; } buf; struct scsi_device *sdev = sdkp->device; struct scsi_sense_hdr sshdr; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, }; int res; put_unaligned_be16(stream_id, &cdb[4]); put_unaligned_be32(sizeof(buf), &cdb[10]); res = scsi_execute_cmd(sdev, cdb, REQ_OP_DRV_IN, &buf, sizeof(buf), SD_TIMEOUT, sdkp->max_retries, &exec_args); if (res < 0) return false; if (scsi_status_is_check_condition(res) && scsi_sense_valid(&sshdr)) sd_print_sense_hdr(sdkp, &sshdr); if (res) return false; if (get_unaligned_be32(&buf.h.len) < sizeof(struct scsi_stream_status)) return false; return buf.s.perm; } static void sd_read_io_hints(struct scsi_disk *sdkp, unsigned char *buffer) { struct scsi_device *sdp = sdkp->device; const struct scsi_io_group_descriptor *desc, *start, *end; u16 permanent_stream_count_old; struct scsi_sense_hdr sshdr; struct scsi_mode_data data; int res; if (sdp->sdev_bflags & BLIST_SKIP_IO_HINTS) return; res = scsi_mode_sense(sdp, /*dbd=*/0x8, /*modepage=*/0x0a, /*subpage=*/0x05, buffer, SD_BUF_SIZE, SD_TIMEOUT, sdkp->max_retries, &data, &sshdr); if (res < 0) return; start = (void *)buffer + data.header_length + 16; end = (void *)buffer + ALIGN_DOWN(data.header_length + data.length, sizeof(*end)); /* * From "SBC-5 Constrained Streams with Data Lifetimes": Device severs * should assign the lowest numbered stream identifiers to permanent * streams. */ for (desc = start; desc < end; desc++) if (!desc->st_enble || !sd_is_perm_stream(sdkp, desc - start)) break; permanent_stream_count_old = sdkp->permanent_stream_count; sdkp->permanent_stream_count = desc - start; if (sdkp->rscs && sdkp->permanent_stream_count < 2) sd_printk(KERN_INFO, sdkp, "Unexpected: RSCS has been set and the permanent stream count is %u\n", sdkp->permanent_stream_count); else if (sdkp->permanent_stream_count != permanent_stream_count_old) sd_printk(KERN_INFO, sdkp, "permanent stream count = %d\n", sdkp->permanent_stream_count); } /* * The ATO bit indicates whether the DIF application tag is available * for use by the operating system. */ static void sd_read_app_tag_own(struct scsi_disk *sdkp, unsigned char *buffer) { int res, offset; struct scsi_device *sdp = sdkp->device; struct scsi_mode_data data; struct scsi_sense_hdr sshdr; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC) return; if (sdkp->protection_type == 0) return; res = scsi_mode_sense(sdp, 1, 0x0a, 0, buffer, 36, SD_TIMEOUT, sdkp->max_retries, &data, &sshdr); if (res < 0 || !data.header_length || data.length < 6) { sd_first_printk(KERN_WARNING, sdkp, "getting Control mode page failed, assume no ATO\n"); if (res == -EIO && scsi_sense_valid(&sshdr)) sd_print_sense_hdr(sdkp, &sshdr); return; } offset = data.header_length + data.block_descriptor_length; if ((buffer[offset] & 0x3f) != 0x0a) { sd_first_printk(KERN_ERR, sdkp, "ATO Got wrong page\n"); return; } if ((buffer[offset + 5] & 0x80) == 0) return; sdkp->ATO = 1; return; } static unsigned int sd_discard_mode(struct scsi_disk *sdkp) { if (!sdkp->lbpme) return SD_LBP_FULL; if (!sdkp->lbpvpd) { /* LBP VPD page not provided */ if (sdkp->max_unmap_blocks) return SD_LBP_UNMAP; return SD_LBP_WS16; } /* LBP VPD page tells us what to use */ if (sdkp->lbpu && sdkp->max_unmap_blocks) return SD_LBP_UNMAP; if (sdkp->lbpws) return SD_LBP_WS16; if (sdkp->lbpws10) return SD_LBP_WS10; return SD_LBP_DISABLE; } /* * Query disk device for preferred I/O sizes. */ static void sd_read_block_limits(struct scsi_disk *sdkp, struct queue_limits *lim) { struct scsi_vpd *vpd; rcu_read_lock(); vpd = rcu_dereference(sdkp->device->vpd_pgb0); if (!vpd || vpd->len < 16) goto out; sdkp->min_xfer_blocks = get_unaligned_be16(&vpd->data[6]); sdkp->max_xfer_blocks = get_unaligned_be32(&vpd->data[8]); sdkp->opt_xfer_blocks = get_unaligned_be32(&vpd->data[12]); if (vpd->len >= 64) { unsigned int lba_count, desc_count; sdkp->max_ws_blocks = (u32)get_unaligned_be64(&vpd->data[36]); if (!sdkp->lbpme) goto config_atomic; lba_count = get_unaligned_be32(&vpd->data[20]); desc_count = get_unaligned_be32(&vpd->data[24]); if (lba_count && desc_count) sdkp->max_unmap_blocks = lba_count; sdkp->unmap_granularity = get_unaligned_be32(&vpd->data[28]); if (vpd->data[32] & 0x80) sdkp->unmap_alignment = get_unaligned_be32(&vpd->data[32]) & ~(1 << 31); config_atomic: sdkp->max_atomic = get_unaligned_be32(&vpd->data[44]); sdkp->atomic_alignment = get_unaligned_be32(&vpd->data[48]); sdkp->atomic_granularity = get_unaligned_be32(&vpd->data[52]); sdkp->max_atomic_with_boundary = get_unaligned_be32(&vpd->data[56]); sdkp->max_atomic_boundary = get_unaligned_be32(&vpd->data[60]); sd_config_atomic(sdkp, lim); } out: rcu_read_unlock(); } /* Parse the Block Limits Extension VPD page (0xb7) */ static void sd_read_block_limits_ext(struct scsi_disk *sdkp) { struct scsi_vpd *vpd; rcu_read_lock(); vpd = rcu_dereference(sdkp->device->vpd_pgb7); if (vpd && vpd->len >= 6) sdkp->rscs = vpd->data[5] & 1; rcu_read_unlock(); } /* Query block device characteristics */ static void sd_read_block_characteristics(struct scsi_disk *sdkp, struct queue_limits *lim) { struct scsi_vpd *vpd; u16 rot; rcu_read_lock(); vpd = rcu_dereference(sdkp->device->vpd_pgb1); if (!vpd || vpd->len <= 8) { rcu_read_unlock(); return; } rot = get_unaligned_be16(&vpd->data[4]); sdkp->zoned = (vpd->data[8] >> 4) & 3; rcu_read_unlock(); if (rot == 1) lim->features &= ~(BLK_FEAT_ROTATIONAL | BLK_FEAT_ADD_RANDOM); if (!sdkp->first_scan) return; if (sdkp->device->type == TYPE_ZBC) sd_printk(KERN_NOTICE, sdkp, "Host-managed zoned block device\n"); else if (sdkp->zoned == 1) sd_printk(KERN_NOTICE, sdkp, "Host-aware SMR disk used as regular disk\n"); else if (sdkp->zoned == 2) sd_printk(KERN_NOTICE, sdkp, "Drive-managed SMR disk\n"); } /** * sd_read_block_provisioning - Query provisioning VPD page * @sdkp: disk to query */ static void sd_read_block_provisioning(struct scsi_disk *sdkp) { struct scsi_vpd *vpd; if (sdkp->lbpme == 0) return; rcu_read_lock(); vpd = rcu_dereference(sdkp->device->vpd_pgb2); if (!vpd || vpd->len < 8) { rcu_read_unlock(); return; } sdkp->lbpvpd = 1; sdkp->lbpu = (vpd->data[5] >> 7) & 1; /* UNMAP */ sdkp->lbpws = (vpd->data[5] >> 6) & 1; /* WRITE SAME(16) w/ UNMAP */ sdkp->lbpws10 = (vpd->data[5] >> 5) & 1; /* WRITE SAME(10) w/ UNMAP */ rcu_read_unlock(); } static void sd_read_write_same(struct scsi_disk *sdkp, unsigned char *buffer) { struct scsi_device *sdev = sdkp->device; if (sdev->host->no_write_same) { sdev->no_write_same = 1; return; } if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, INQUIRY, 0) < 0) { sdev->no_report_opcodes = 1; /* * Disable WRITE SAME if REPORT SUPPORTED OPERATION CODES is * unsupported and this is an ATA device. */ if (sdev->is_ata) sdev->no_write_same = 1; } if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, WRITE_SAME_16, 0) == 1) sdkp->ws16 = 1; if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, WRITE_SAME, 0) == 1) sdkp->ws10 = 1; } static void sd_read_security(struct scsi_disk *sdkp, unsigned char *buffer) { struct scsi_device *sdev = sdkp->device; if (!sdev->security_supported) return; if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, SECURITY_PROTOCOL_IN, 0) == 1 && scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, SECURITY_PROTOCOL_OUT, 0) == 1) sdkp->security = 1; } static inline sector_t sd64_to_sectors(struct scsi_disk *sdkp, u8 *buf) { return logical_to_sectors(sdkp->device, get_unaligned_be64(buf)); } /** * sd_read_cpr - Query concurrent positioning ranges * @sdkp: disk to query */ static void sd_read_cpr(struct scsi_disk *sdkp) { struct blk_independent_access_ranges *iars = NULL; unsigned char *buffer = NULL; unsigned int nr_cpr = 0; int i, vpd_len, buf_len = SD_BUF_SIZE; u8 *desc; /* * We need to have the capacity set first for the block layer to be * able to check the ranges. */ if (sdkp->first_scan) return; if (!sdkp->capacity) goto out; /* * Concurrent Positioning Ranges VPD: there can be at most 256 ranges, * leading to a maximum page size of 64 + 256*32 bytes. */ buf_len = 64 + 256*32; buffer = kmalloc(buf_len, GFP_KERNEL); if (!buffer || scsi_get_vpd_page(sdkp->device, 0xb9, buffer, buf_len)) goto out; /* We must have at least a 64B header and one 32B range descriptor */ vpd_len = get_unaligned_be16(&buffer[2]) + 4; if (vpd_len > buf_len || vpd_len < 64 + 32 || (vpd_len & 31)) { sd_printk(KERN_ERR, sdkp, "Invalid Concurrent Positioning Ranges VPD page\n"); goto out; } nr_cpr = (vpd_len - 64) / 32; if (nr_cpr == 1) { nr_cpr = 0; goto out; } iars = disk_alloc_independent_access_ranges(sdkp->disk, nr_cpr); if (!iars) { nr_cpr = 0; goto out; } desc = &buffer[64]; for (i = 0; i < nr_cpr; i++, desc += 32) { if (desc[0] != i) { sd_printk(KERN_ERR, sdkp, "Invalid Concurrent Positioning Range number\n"); nr_cpr = 0; break; } iars->ia_range[i].sector = sd64_to_sectors(sdkp, desc + 8); iars->ia_range[i].nr_sectors = sd64_to_sectors(sdkp, desc + 16); } out: disk_set_independent_access_ranges(sdkp->disk, iars); if (nr_cpr && sdkp->nr_actuators != nr_cpr) { sd_printk(KERN_NOTICE, sdkp, "%u concurrent positioning ranges\n", nr_cpr); sdkp->nr_actuators = nr_cpr; } kfree(buffer); } static bool sd_validate_min_xfer_size(struct scsi_disk *sdkp) { struct scsi_device *sdp = sdkp->device; unsigned int min_xfer_bytes = logical_to_bytes(sdp, sdkp->min_xfer_blocks); if (sdkp->min_xfer_blocks == 0) return false; if (min_xfer_bytes & (sdkp->physical_block_size - 1)) { sd_first_printk(KERN_WARNING, sdkp, "Preferred minimum I/O size %u bytes not a " \ "multiple of physical block size (%u bytes)\n", min_xfer_bytes, sdkp->physical_block_size); sdkp->min_xfer_blocks = 0; return false; } sd_first_printk(KERN_INFO, sdkp, "Preferred minimum I/O size %u bytes\n", min_xfer_bytes); return true; } /* * Determine the device's preferred I/O size for reads and writes * unless the reported value is unreasonably small, large, not a * multiple of the physical block size, or simply garbage. */ static bool sd_validate_opt_xfer_size(struct scsi_disk *sdkp, unsigned int dev_max) { struct scsi_device *sdp = sdkp->device; unsigned int opt_xfer_bytes = logical_to_bytes(sdp, sdkp->opt_xfer_blocks); unsigned int min_xfer_bytes = logical_to_bytes(sdp, sdkp->min_xfer_blocks); if (sdkp->opt_xfer_blocks == 0) return false; if (sdkp->opt_xfer_blocks > dev_max) { sd_first_printk(KERN_WARNING, sdkp, "Optimal transfer size %u logical blocks " \ "> dev_max (%u logical blocks)\n", sdkp->opt_xfer_blocks, dev_max); return false; } if (sdkp->opt_xfer_blocks > SD_DEF_XFER_BLOCKS) { sd_first_printk(KERN_WARNING, sdkp, "Optimal transfer size %u logical blocks " \ "> sd driver limit (%u logical blocks)\n", sdkp->opt_xfer_blocks, SD_DEF_XFER_BLOCKS); return false; } if (opt_xfer_bytes < PAGE_SIZE) { sd_first_printk(KERN_WARNING, sdkp, "Optimal transfer size %u bytes < " \ "PAGE_SIZE (%u bytes)\n", opt_xfer_bytes, (unsigned int)PAGE_SIZE); return false; } if (min_xfer_bytes && opt_xfer_bytes % min_xfer_bytes) { sd_first_printk(KERN_WARNING, sdkp, "Optimal transfer size %u bytes not a " \ "multiple of preferred minimum block " \ "size (%u bytes)\n", opt_xfer_bytes, min_xfer_bytes); return false; } if (opt_xfer_bytes & (sdkp->physical_block_size - 1)) { sd_first_printk(KERN_WARNING, sdkp, "Optimal transfer size %u bytes not a " \ "multiple of physical block size (%u bytes)\n", opt_xfer_bytes, sdkp->physical_block_size); return false; } sd_first_printk(KERN_INFO, sdkp, "Optimal transfer size %u bytes\n", opt_xfer_bytes); return true; } static void sd_read_block_zero(struct scsi_disk *sdkp) { struct scsi_device *sdev = sdkp->device; unsigned int buf_len = sdev->sector_size; u8 *buffer, cmd[16] = { }; buffer = kmalloc(buf_len, GFP_KERNEL); if (!buffer) return; if (sdev->use_16_for_rw) { cmd[0] = READ_16; put_unaligned_be64(0, &cmd[2]); /* Logical block address 0 */ put_unaligned_be32(1, &cmd[10]);/* Transfer 1 logical block */ } else { cmd[0] = READ_10; put_unaligned_be32(0, &cmd[2]); /* Logical block address 0 */ put_unaligned_be16(1, &cmd[7]); /* Transfer 1 logical block */ } scsi_execute_cmd(sdkp->device, cmd, REQ_OP_DRV_IN, buffer, buf_len, SD_TIMEOUT, sdkp->max_retries, NULL); kfree(buffer); } /** * sd_revalidate_disk - called the first time a new disk is seen, * performs disk spin up, read_capacity, etc. * @disk: struct gendisk we care about **/ static void sd_revalidate_disk(struct gendisk *disk) { struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdp = sdkp->device; sector_t old_capacity = sdkp->capacity; struct queue_limits *lim = NULL; unsigned char *buffer = NULL; unsigned int dev_max; int err; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_revalidate_disk\n")); /* * If the device is offline, don't try and read capacity or any * of the other niceties. */ if (!scsi_device_online(sdp)) return; lim = kmalloc(sizeof(*lim), GFP_KERNEL); if (!lim) return; buffer = kmalloc(SD_BUF_SIZE, GFP_KERNEL); if (!buffer) goto out; sd_spinup_disk(sdkp); *lim = queue_limits_start_update(sdkp->disk->queue); /* * Without media there is no reason to ask; moreover, some devices * react badly if we do. */ if (sdkp->media_present) { sd_read_capacity(sdkp, lim, buffer); /* * Some USB/UAS devices return generic values for mode pages * until the media has been accessed. Trigger a READ operation * to force the device to populate mode pages. */ if (sdp->read_before_ms) sd_read_block_zero(sdkp); /* * set the default to rotational. All non-rotational devices * support the block characteristics VPD page, which will * cause this to be updated correctly and any device which * doesn't support it should be treated as rotational. */ lim->features |= (BLK_FEAT_ROTATIONAL | BLK_FEAT_ADD_RANDOM); if (scsi_device_supports_vpd(sdp)) { sd_read_block_provisioning(sdkp); sd_read_block_limits(sdkp, lim); sd_read_block_limits_ext(sdkp); sd_read_block_characteristics(sdkp, lim); sd_zbc_read_zones(sdkp, lim, buffer); } sd_config_discard(sdkp, lim, sd_discard_mode(sdkp)); sd_print_capacity(sdkp, old_capacity); sd_read_write_protect_flag(sdkp, buffer); sd_read_cache_type(sdkp, buffer); sd_read_io_hints(sdkp, buffer); sd_read_app_tag_own(sdkp, buffer); sd_read_write_same(sdkp, buffer); sd_read_security(sdkp, buffer); sd_config_protection(sdkp, lim); } /* * We now have all cache related info, determine how we deal * with flush requests. */ sd_set_flush_flag(sdkp, lim); /* Initial block count limit based on CDB TRANSFER LENGTH field size. */ dev_max = sdp->use_16_for_rw ? SD_MAX_XFER_BLOCKS : SD_DEF_XFER_BLOCKS; /* Some devices report a maximum block count for READ/WRITE requests. */ dev_max = min_not_zero(dev_max, sdkp->max_xfer_blocks); lim->max_dev_sectors = logical_to_sectors(sdp, dev_max); if (sd_validate_min_xfer_size(sdkp)) lim->io_min = logical_to_bytes(sdp, sdkp->min_xfer_blocks); else lim->io_min = 0; /* * Limit default to SCSI host optimal sector limit if set. There may be * an impact on performance for when the size of a request exceeds this * host limit. */ lim->io_opt = sdp->host->opt_sectors << SECTOR_SHIFT; if (sd_validate_opt_xfer_size(sdkp, dev_max)) { lim->io_opt = min_not_zero(lim->io_opt, logical_to_bytes(sdp, sdkp->opt_xfer_blocks)); } sdkp->first_scan = 0; set_capacity_and_notify(disk, logical_to_sectors(sdp, sdkp->capacity)); sd_config_write_same(sdkp, lim); err = queue_limits_commit_update_frozen(sdkp->disk->queue, lim); if (err) goto out; /* * Query concurrent positioning ranges after * queue_limits_commit_update() unlocked q->limits_lock to avoid * deadlock with q->sysfs_dir_lock and q->sysfs_lock. */ if (sdkp->media_present && scsi_device_supports_vpd(sdp)) sd_read_cpr(sdkp); /* * For a zoned drive, revalidating the zones can be done only once * the gendisk capacity is set. So if this fails, set back the gendisk * capacity to 0. */ if (sd_zbc_revalidate_zones(sdkp)) set_capacity_and_notify(disk, 0); out: kfree(buffer); kfree(lim); } /** * sd_unlock_native_capacity - unlock native capacity * @disk: struct gendisk to set capacity for * * Block layer calls this function if it detects that partitions * on @disk reach beyond the end of the device. If the SCSI host * implements ->unlock_native_capacity() method, it's invoked to * give it a chance to adjust the device capacity. * * CONTEXT: * Defined by block layer. Might sleep. */ static void sd_unlock_native_capacity(struct gendisk *disk) { struct scsi_device *sdev = scsi_disk(disk)->device; if (sdev->host->hostt->unlock_native_capacity) sdev->host->hostt->unlock_native_capacity(sdev); } /** * sd_format_disk_name - format disk name * @prefix: name prefix - ie. "sd" for SCSI disks * @index: index of the disk to format name for * @buf: output buffer * @buflen: length of the output buffer * * SCSI disk names starts at sda. The 26th device is sdz and the * 27th is sdaa. The last one for two lettered suffix is sdzz * which is followed by sdaaa. * * This is basically 26 base counting with one extra 'nil' entry * at the beginning from the second digit on and can be * determined using similar method as 26 base conversion with the * index shifted -1 after each digit is computed. * * CONTEXT: * Don't care. * * RETURNS: * 0 on success, -errno on failure. */ static int sd_format_disk_name(char *prefix, int index, char *buf, int buflen) { const int base = 'z' - 'a' + 1; char *begin = buf + strlen(prefix); char *end = buf + buflen; char *p; int unit; p = end - 1; *p = '\0'; unit = base; do { if (p == begin) return -EINVAL; *--p = 'a' + (index % unit); index = (index / unit) - 1; } while (index >= 0); memmove(begin, p, end - p); memcpy(buf, prefix, strlen(prefix)); return 0; } /** * sd_probe - called during driver initialization and whenever a * new scsi device is attached to the system. It is called once * for each scsi device (not just disks) present. * @dev: pointer to device object * * Returns 0 if successful (or not interested in this scsi device * (e.g. scanner)); 1 when there is an error. * * Note: this function is invoked from the scsi mid-level. * This function sets up the mapping between a given * <host,channel,id,lun> (found in sdp) and new device name * (e.g. /dev/sda). More precisely it is the block device major * and minor number that is chosen here. * * Assume sd_probe is not re-entrant (for time being) * Also think about sd_probe() and sd_remove() running coincidentally. **/ static int sd_probe(struct device *dev) { struct scsi_device *sdp = to_scsi_device(dev); struct scsi_disk *sdkp; struct gendisk *gd; int index; int error; scsi_autopm_get_device(sdp); error = -ENODEV; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC && sdp->type != TYPE_MOD && sdp->type != TYPE_RBC) goto out; if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) && sdp->type == TYPE_ZBC) { sdev_printk(KERN_WARNING, sdp, "Unsupported ZBC host-managed device.\n"); goto out; } SCSI_LOG_HLQUEUE(3, sdev_printk(KERN_INFO, sdp, "sd_probe\n")); error = -ENOMEM; sdkp = kzalloc(sizeof(*sdkp), GFP_KERNEL); if (!sdkp) goto out; gd = blk_mq_alloc_disk_for_queue(sdp->request_queue, &sd_bio_compl_lkclass); if (!gd) goto out_free; index = ida_alloc(&sd_index_ida, GFP_KERNEL); if (index < 0) { sdev_printk(KERN_WARNING, sdp, "sd_probe: memory exhausted.\n"); goto out_put; } error = sd_format_disk_name("sd", index, gd->disk_name, DISK_NAME_LEN); if (error) { sdev_printk(KERN_WARNING, sdp, "SCSI disk (sd) name length exceeded.\n"); goto out_free_index; } sdkp->device = sdp; sdkp->disk = gd; sdkp->index = index; sdkp->max_retries = SD_MAX_RETRIES; atomic_set(&sdkp->openers, 0); atomic_set(&sdkp->device->ioerr_cnt, 0); if (!sdp->request_queue->rq_timeout) { if (sdp->type != TYPE_MOD) blk_queue_rq_timeout(sdp->request_queue, SD_TIMEOUT); else blk_queue_rq_timeout(sdp->request_queue, SD_MOD_TIMEOUT); } device_initialize(&sdkp->disk_dev); sdkp->disk_dev.parent = get_device(dev); sdkp->disk_dev.class = &sd_disk_class; dev_set_name(&sdkp->disk_dev, "%s", dev_name(dev)); error = device_add(&sdkp->disk_dev); if (error) { put_device(&sdkp->disk_dev); goto out; } dev_set_drvdata(dev, sdkp); gd->major = sd_major((index & 0xf0) >> 4); gd->first_minor = ((index & 0xf) << 4) | (index & 0xfff00); gd->minors = SD_MINORS; gd->fops = &sd_fops; gd->private_data = sdkp; /* defaults, until the device tells us otherwise */ sdp->sector_size = 512; sdkp->capacity = 0; sdkp->media_present = 1; sdkp->write_prot = 0; sdkp->cache_override = 0; sdkp->WCE = 0; sdkp->RCD = 0; sdkp->ATO = 0; sdkp->first_scan = 1; sdkp->max_medium_access_timeouts = SD_MAX_MEDIUM_TIMEOUTS; sd_revalidate_disk(gd); if (sdp->removable) { gd->flags |= GENHD_FL_REMOVABLE; gd->events |= DISK_EVENT_MEDIA_CHANGE; gd->event_flags = DISK_EVENT_FLAG_POLL | DISK_EVENT_FLAG_UEVENT; } blk_pm_runtime_init(sdp->request_queue, dev); if (sdp->rpm_autosuspend) { pm_runtime_set_autosuspend_delay(dev, sdp->host->rpm_autosuspend_delay); } error = device_add_disk(dev, gd, NULL); if (error) { device_unregister(&sdkp->disk_dev); put_disk(gd); goto out; } if (sdkp->security) { sdkp->opal_dev = init_opal_dev(sdkp, &sd_sec_submit); if (sdkp->opal_dev) sd_printk(KERN_NOTICE, sdkp, "supports TCG Opal\n"); } sd_printk(KERN_NOTICE, sdkp, "Attached SCSI %sdisk\n", sdp->removable ? "removable " : ""); scsi_autopm_put_device(sdp); return 0; out_free_index: ida_free(&sd_index_ida, index); out_put: put_disk(gd); out_free: kfree(sdkp); out: scsi_autopm_put_device(sdp); return error; } /** * sd_remove - called whenever a scsi disk (previously recognized by * sd_probe) is detached from the system. It is called (potentially * multiple times) during sd module unload. * @dev: pointer to device object * * Note: this function is invoked from the scsi mid-level. * This function potentially frees up a device name (e.g. /dev/sdc) * that could be re-used by a subsequent sd_probe(). * This function is not called when the built-in sd driver is "exit-ed". **/ static int sd_remove(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); scsi_autopm_get_device(sdkp->device); device_del(&sdkp->disk_dev); del_gendisk(sdkp->disk); if (!sdkp->suspended) sd_shutdown(dev); put_disk(sdkp->disk); return 0; } static void scsi_disk_release(struct device *dev) { struct scsi_disk *sdkp = to_scsi_disk(dev); ida_free(&sd_index_ida, sdkp->index); put_device(&sdkp->device->sdev_gendev); free_opal_dev(sdkp->opal_dev); kfree(sdkp); } static int sd_start_stop_device(struct scsi_disk *sdkp, int start) { unsigned char cmd[6] = { START_STOP }; /* START_VALID */ struct scsi_sense_hdr sshdr; struct scsi_failure failure_defs[] = { { /* Power on, reset, or bus device reset occurred */ .sense = UNIT_ATTENTION, .asc = 0x29, .ascq = 0, .result = SAM_STAT_CHECK_CONDITION, }, { /* Power on occurred */ .sense = UNIT_ATTENTION, .asc = 0x29, .ascq = 1, .result = SAM_STAT_CHECK_CONDITION, }, { /* SCSI bus reset */ .sense = UNIT_ATTENTION, .asc = 0x29, .ascq = 2, .result = SAM_STAT_CHECK_CONDITION, }, {} }; struct scsi_failures failures = { .total_allowed = 3, .failure_definitions = failure_defs, }; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, .req_flags = BLK_MQ_REQ_PM, .failures = &failures, }; struct scsi_device *sdp = sdkp->device; int res; if (start) cmd[4] |= 1; /* START */ if (sdp->start_stop_pwr_cond) cmd[4] |= start ? 1 << 4 : 3 << 4; /* Active or Standby */ if (!scsi_device_online(sdp)) return -ENODEV; res = scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, NULL, 0, SD_TIMEOUT, sdkp->max_retries, &exec_args); if (res) { sd_print_result(sdkp, "Start/Stop Unit failed", res); if (res > 0 && scsi_sense_valid(&sshdr)) { sd_print_sense_hdr(sdkp, &sshdr); /* 0x3a is medium not present */ if (sshdr.asc == 0x3a) res = 0; } } /* SCSI error codes must not go to the generic layer */ if (res) return -EIO; return 0; } /* * Send a SYNCHRONIZE CACHE instruction down to the device through * the normal SCSI command structure. Wait for the command to * complete. */ static void sd_shutdown(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); if (!sdkp) return; /* this can happen */ if (pm_runtime_suspended(dev)) return; if (sdkp->WCE && sdkp->media_present) { sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n"); sd_sync_cache(sdkp); } if ((system_state != SYSTEM_RESTART && sdkp->device->manage_system_start_stop) || (system_state == SYSTEM_POWER_OFF && sdkp->device->manage_shutdown) || (system_state == SYSTEM_RUNNING && sdkp->device->manage_runtime_start_stop)) { sd_printk(KERN_NOTICE, sdkp, "Stopping disk\n"); sd_start_stop_device(sdkp, 0); } } static inline bool sd_do_start_stop(struct scsi_device *sdev, bool runtime) { return (sdev->manage_system_start_stop && !runtime) || (sdev->manage_runtime_start_stop && runtime); } static int sd_suspend_common(struct device *dev, bool runtime) { struct scsi_disk *sdkp = dev_get_drvdata(dev); int ret = 0; if (!sdkp) /* E.g.: runtime suspend following sd_remove() */ return 0; if (sdkp->WCE && sdkp->media_present) { if (!sdkp->device->silence_suspend) sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n"); ret = sd_sync_cache(sdkp); /* ignore OFFLINE device */ if (ret == -ENODEV) return 0; if (ret) return ret; } if (sd_do_start_stop(sdkp->device, runtime)) { if (!sdkp->device->silence_suspend) sd_printk(KERN_NOTICE, sdkp, "Stopping disk\n"); /* an error is not worth aborting a system sleep */ ret = sd_start_stop_device(sdkp, 0); if (!runtime) ret = 0; } if (!ret) sdkp->suspended = true; return ret; } static int sd_suspend_system(struct device *dev) { if (pm_runtime_suspended(dev)) return 0; return sd_suspend_common(dev, false); } static int sd_suspend_runtime(struct device *dev) { return sd_suspend_common(dev, true); } static int sd_resume(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); sd_printk(KERN_NOTICE, sdkp, "Starting disk\n"); if (opal_unlock_from_suspend(sdkp->opal_dev)) { sd_printk(KERN_NOTICE, sdkp, "OPAL unlock failed\n"); return -EIO; } return 0; } static int sd_resume_common(struct device *dev, bool runtime) { struct scsi_disk *sdkp = dev_get_drvdata(dev); int ret; if (!sdkp) /* E.g.: runtime resume at the start of sd_probe() */ return 0; if (!sd_do_start_stop(sdkp->device, runtime)) { sdkp->suspended = false; return 0; } sd_printk(KERN_NOTICE, sdkp, "Starting disk\n"); ret = sd_start_stop_device(sdkp, 1); if (!ret) { sd_resume(dev); sdkp->suspended = false; } return ret; } static int sd_resume_system(struct device *dev) { if (pm_runtime_suspended(dev)) { struct scsi_disk *sdkp = dev_get_drvdata(dev); struct scsi_device *sdp = sdkp ? sdkp->device : NULL; if (sdp && sdp->force_runtime_start_on_system_start) pm_request_resume(dev); return 0; } return sd_resume_common(dev, false); } static int sd_resume_runtime(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); struct scsi_device *sdp; if (!sdkp) /* E.g.: runtime resume at the start of sd_probe() */ return 0; sdp = sdkp->device; if (sdp->ignore_media_change) { /* clear the device's sense data */ static const u8 cmd[10] = { REQUEST_SENSE }; const struct scsi_exec_args exec_args = { .req_flags = BLK_MQ_REQ_PM, }; if (scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, NULL, 0, sdp->request_queue->rq_timeout, 1, &exec_args)) sd_printk(KERN_NOTICE, sdkp, "Failed to clear sense data\n"); } return sd_resume_common(dev, true); } static const struct dev_pm_ops sd_pm_ops = { .suspend = sd_suspend_system, .resume = sd_resume_system, .poweroff = sd_suspend_system, .restore = sd_resume_system, .runtime_suspend = sd_suspend_runtime, .runtime_resume = sd_resume_runtime, }; static struct scsi_driver sd_template = { .gendrv = { .name = "sd", .probe = sd_probe, .probe_type = PROBE_PREFER_ASYNCHRONOUS, .remove = sd_remove, .shutdown = sd_shutdown, .pm = &sd_pm_ops, }, .rescan = sd_rescan, .resume = sd_resume, .init_command = sd_init_command, .uninit_command = sd_uninit_command, .done = sd_done, .eh_action = sd_eh_action, .eh_reset = sd_eh_reset, }; /** * init_sd - entry point for this driver (both when built in or when * a module). * * Note: this function registers this driver with the scsi mid-level. **/ static int __init init_sd(void) { int majors = 0, i, err; SCSI_LOG_HLQUEUE(3, printk("init_sd: sd driver entry point\n")); for (i = 0; i < SD_MAJORS; i++) { if (__register_blkdev(sd_major(i), "sd", sd_default_probe)) continue; majors++; } if (!majors) return -ENODEV; err = class_register(&sd_disk_class); if (err) goto err_out; sd_page_pool = mempool_create_page_pool(SD_MEMPOOL_SIZE, 0); if (!sd_page_pool) { printk(KERN_ERR "sd: can't init discard page pool\n"); err = -ENOMEM; goto err_out_class; } err = scsi_register_driver(&sd_template.gendrv); if (err) goto err_out_driver; return 0; err_out_driver: mempool_destroy(sd_page_pool); err_out_class: class_unregister(&sd_disk_class); err_out: for (i = 0; i < SD_MAJORS; i++) unregister_blkdev(sd_major(i), "sd"); return err; } /** * exit_sd - exit point for this driver (when it is a module). * * Note: this function unregisters this driver from the scsi mid-level. **/ static void __exit exit_sd(void) { int i; SCSI_LOG_HLQUEUE(3, printk("exit_sd: exiting sd driver\n")); scsi_unregister_driver(&sd_template.gendrv); mempool_destroy(sd_page_pool); class_unregister(&sd_disk_class); for (i = 0; i < SD_MAJORS; i++) unregister_blkdev(sd_major(i), "sd"); } module_init(init_sd); module_exit(exit_sd); void sd_print_sense_hdr(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr) { scsi_print_sense_hdr(sdkp->device, sdkp->disk ? sdkp->disk->disk_name : NULL, sshdr); } void sd_print_result(const struct scsi_disk *sdkp, const char *msg, int result) { const char *hb_string = scsi_hostbyte_string(result); if (hb_string) sd_printk(KERN_INFO, sdkp, "%s: Result: hostbyte=%s driverbyte=%s\n", msg, hb_string ? hb_string : "invalid", "DRIVER_OK"); else sd_printk(KERN_INFO, sdkp, "%s: Result: hostbyte=0x%02x driverbyte=%s\n", msg, host_byte(result), "DRIVER_OK"); }
10 10 10 199 199 189 10 10 156 33 149 97 23 87 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/types.h> #include <linux/ip.h> #include <linux/netfilter.h> #include <linux/module.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <net/netns/generic.h> #include <net/route.h> #include <net/ip.h> #include <linux/netfilter_bridge.h> #include <linux/netfilter_ipv4.h> #include <net/netfilter/ipv4/nf_defrag_ipv4.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> #endif #include <net/netfilter/nf_conntrack_zones.h> static DEFINE_MUTEX(defrag4_mutex); static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb, u_int32_t user) { int err; local_bh_disable(); err = ip_defrag(net, skb, user); local_bh_enable(); if (!err) skb->ignore_df = 1; return err; } static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, struct sk_buff *skb) { u16 zone_id = NF_CT_DEFAULT_ZONE_ID; #if IS_ENABLED(CONFIG_NF_CONNTRACK) if (skb_nfct(skb)) { enum ip_conntrack_info ctinfo; const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo)); } #endif if (nf_bridge_in_prerouting(skb)) return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id; if (hooknum == NF_INET_PRE_ROUTING) return IP_DEFRAG_CONNTRACK_IN + zone_id; else return IP_DEFRAG_CONNTRACK_OUT + zone_id; } static unsigned int ipv4_conntrack_defrag(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct sock *sk = skb->sk; if (sk && sk_fullsock(sk) && (sk->sk_family == PF_INET) && inet_test_bit(NODEFRAG, sk)) return NF_ACCEPT; #if IS_ENABLED(CONFIG_NF_CONNTRACK) #if !IS_ENABLED(CONFIG_NF_NAT) /* Previously seen (loopback)? Ignore. Do this before fragment check. */ if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb))) return NF_ACCEPT; #endif if (skb->_nfct == IP_CT_UNTRACKED) return NF_ACCEPT; #endif /* Gather fragments. */ if (ip_is_fragment(ip_hdr(skb))) { enum ip_defrag_users user = nf_ct_defrag_user(state->hook, skb); if (nf_ct_ipv4_gather_frags(state->net, skb, user)) return NF_STOLEN; } return NF_ACCEPT; } static const struct nf_hook_ops ipv4_defrag_ops[] = { { .hook = ipv4_conntrack_defrag, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_CONNTRACK_DEFRAG, }, { .hook = ipv4_conntrack_defrag, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_CONNTRACK_DEFRAG, }, }; static void __net_exit defrag4_net_exit(struct net *net) { if (net->nf.defrag_ipv4_users) { nf_unregister_net_hooks(net, ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); net->nf.defrag_ipv4_users = 0; } } static const struct nf_defrag_hook defrag_hook = { .owner = THIS_MODULE, .enable = nf_defrag_ipv4_enable, .disable = nf_defrag_ipv4_disable, }; static struct pernet_operations defrag4_net_ops = { .exit = defrag4_net_exit, }; static int __init nf_defrag_init(void) { int err; err = register_pernet_subsys(&defrag4_net_ops); if (err) return err; rcu_assign_pointer(nf_defrag_v4_hook, &defrag_hook); return err; } static void __exit nf_defrag_fini(void) { rcu_assign_pointer(nf_defrag_v4_hook, NULL); unregister_pernet_subsys(&defrag4_net_ops); } int nf_defrag_ipv4_enable(struct net *net) { int err = 0; mutex_lock(&defrag4_mutex); if (net->nf.defrag_ipv4_users == UINT_MAX) { err = -EOVERFLOW; goto out_unlock; } if (net->nf.defrag_ipv4_users) { net->nf.defrag_ipv4_users++; goto out_unlock; } err = nf_register_net_hooks(net, ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); if (err == 0) net->nf.defrag_ipv4_users = 1; out_unlock: mutex_unlock(&defrag4_mutex); return err; } EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable); void nf_defrag_ipv4_disable(struct net *net) { mutex_lock(&defrag4_mutex); if (net->nf.defrag_ipv4_users) { net->nf.defrag_ipv4_users--; if (net->nf.defrag_ipv4_users == 0) nf_unregister_net_hooks(net, ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); } mutex_unlock(&defrag4_mutex); } EXPORT_SYMBOL_GPL(nf_defrag_ipv4_disable); module_init(nf_defrag_init); module_exit(nf_defrag_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IPv4 defragmentation support");
15 15 14 15 3017 2842 271 3018 3019 3015 2717 572 3019 1543 1547 1468 86 1542 1539 1529 16 87 87 71 16 71 71 636 636 233 234 336 23 3829 3828 23 25 25 25 1544 1546 1389 1413 155 223 37 1542 1543 1545 1543 6 1546 1549 310 1250 1542 1543 1545 11 11 11 11 223 218 224 224 218 10 18 218 2513 2515 2518 51 2515 2516 1204 1207 261 329 171 217 218 217 224 223 18 218 218 11 11 85 1544 71 1487 1545 1782 1781 1786 1544 1545 1549 542 310 87 87 87 87 87 384 384 87 468 547 56 542 543 543 390 17 385 384 385 68 26 43 104 104 31 87 88 71 678 285 414 3 2 679 104 103 17 104 87 72 72 53 69 40 40 31 18 40 263 263 52 52 23 24 23 1443 1447 1784 1771 41 1633 1638 1302 378 5 2 375 100 376 4 4 435 424 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 // SPDX-License-Identifier: GPL-2.0-only /* * (C) 1997 Linus Torvalds * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation) */ #include <linux/export.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/mm.h> #include <linux/backing-dev.h> #include <linux/hash.h> #include <linux/swap.h> #include <linux/security.h> #include <linux/cdev.h> #include <linux/memblock.h> #include <linux/fsnotify.h> #include <linux/mount.h> #include <linux/posix_acl.h> #include <linux/buffer_head.h> /* for inode_has_buffers */ #include <linux/ratelimit.h> #include <linux/list_lru.h> #include <linux/iversion.h> #include <linux/rw_hint.h> #include <linux/seq_file.h> #include <linux/debugfs.h> #include <trace/events/writeback.h> #define CREATE_TRACE_POINTS #include <trace/events/timestamp.h> #include "internal.h" /* * Inode locking rules: * * inode->i_lock protects: * inode->i_state, inode->i_hash, __iget(), inode->i_io_list * Inode LRU list locks protect: * inode->i_sb->s_inode_lru, inode->i_lru * inode->i_sb->s_inode_list_lock protects: * inode->i_sb->s_inodes, inode->i_sb_list * bdi->wb.list_lock protects: * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list * inode_hash_lock protects: * inode_hashtable, inode->i_hash * * Lock ordering: * * inode->i_sb->s_inode_list_lock * inode->i_lock * Inode LRU list locks * * bdi->wb.list_lock * inode->i_lock * * inode_hash_lock * inode->i_sb->s_inode_list_lock * inode->i_lock * * iunique_lock * inode_hash_lock */ static unsigned int i_hash_mask __ro_after_init; static unsigned int i_hash_shift __ro_after_init; static struct hlist_head *inode_hashtable __ro_after_init; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); /* * Empty aops. Can be used for the cases where the user does not * define any of the address_space operations. */ const struct address_space_operations empty_aops = { }; EXPORT_SYMBOL(empty_aops); static DEFINE_PER_CPU(unsigned long, nr_inodes); static DEFINE_PER_CPU(unsigned long, nr_unused); static struct kmem_cache *inode_cachep __ro_after_init; static long get_nr_inodes(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_inodes, i); return sum < 0 ? 0 : sum; } static inline long get_nr_inodes_unused(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_unused, i); return sum < 0 ? 0 : sum; } long get_nr_dirty_inodes(void) { /* not actually dirty inodes, but a wild approximation */ long nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); return nr_dirty > 0 ? nr_dirty : 0; } #ifdef CONFIG_DEBUG_FS static DEFINE_PER_CPU(long, mg_ctime_updates); static DEFINE_PER_CPU(long, mg_fine_stamps); static DEFINE_PER_CPU(long, mg_ctime_swaps); static unsigned long get_mg_ctime_updates(void) { unsigned long sum = 0; int i; for_each_possible_cpu(i) sum += data_race(per_cpu(mg_ctime_updates, i)); return sum; } static unsigned long get_mg_fine_stamps(void) { unsigned long sum = 0; int i; for_each_possible_cpu(i) sum += data_race(per_cpu(mg_fine_stamps, i)); return sum; } static unsigned long get_mg_ctime_swaps(void) { unsigned long sum = 0; int i; for_each_possible_cpu(i) sum += data_race(per_cpu(mg_ctime_swaps, i)); return sum; } #define mgtime_counter_inc(__var) this_cpu_inc(__var) static int mgts_show(struct seq_file *s, void *p) { unsigned long ctime_updates = get_mg_ctime_updates(); unsigned long ctime_swaps = get_mg_ctime_swaps(); unsigned long fine_stamps = get_mg_fine_stamps(); unsigned long floor_swaps = timekeeping_get_mg_floor_swaps(); seq_printf(s, "%lu %lu %lu %lu\n", ctime_updates, ctime_swaps, fine_stamps, floor_swaps); return 0; } DEFINE_SHOW_ATTRIBUTE(mgts); static int __init mg_debugfs_init(void) { debugfs_create_file("multigrain_timestamps", S_IFREG | S_IRUGO, NULL, NULL, &mgts_fops); return 0; } late_initcall(mg_debugfs_init); #else /* ! CONFIG_DEBUG_FS */ #define mgtime_counter_inc(__var) do { } while (0) #endif /* CONFIG_DEBUG_FS */ /* * Handle nr_inode sysctl */ #ifdef CONFIG_SYSCTL /* * Statistics gathering.. */ static struct inodes_stat_t inodes_stat; static int proc_nr_inodes(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { inodes_stat.nr_inodes = get_nr_inodes(); inodes_stat.nr_unused = get_nr_inodes_unused(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } static const struct ctl_table inodes_sysctls[] = { { .procname = "inode-nr", .data = &inodes_stat, .maxlen = 2*sizeof(long), .mode = 0444, .proc_handler = proc_nr_inodes, }, { .procname = "inode-state", .data = &inodes_stat, .maxlen = 7*sizeof(long), .mode = 0444, .proc_handler = proc_nr_inodes, }, }; static int __init init_fs_inode_sysctls(void) { register_sysctl_init("fs", inodes_sysctls); return 0; } early_initcall(init_fs_inode_sysctls); #endif static int no_open(struct inode *inode, struct file *file) { return -ENXIO; } /** * inode_init_always_gfp - perform inode structure initialisation * @sb: superblock inode belongs to * @inode: inode to initialise * @gfp: allocation flags * * These are initializations that need to be done on every inode * allocation as the fields are not initialised by slab allocation. * If there are additional allocations required @gfp is used. */ int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp) { static const struct inode_operations empty_iops; static const struct file_operations no_open_fops = {.open = no_open}; struct address_space *const mapping = &inode->i_data; inode->i_sb = sb; inode->i_blkbits = sb->s_blocksize_bits; inode->i_flags = 0; inode->i_state = 0; atomic64_set(&inode->i_sequence, 0); atomic_set(&inode->i_count, 1); inode->i_op = &empty_iops; inode->i_fop = &no_open_fops; inode->i_ino = 0; inode->__i_nlink = 1; inode->i_opflags = 0; if (sb->s_xattr) inode->i_opflags |= IOP_XATTR; if (sb->s_type->fs_flags & FS_MGTIME) inode->i_opflags |= IOP_MGTIME; i_uid_write(inode, 0); i_gid_write(inode, 0); atomic_set(&inode->i_writecount, 0); inode->i_size = 0; inode->i_write_hint = WRITE_LIFE_NOT_SET; inode->i_blocks = 0; inode->i_bytes = 0; inode->i_generation = 0; inode->i_pipe = NULL; inode->i_cdev = NULL; inode->i_link = NULL; inode->i_dir_seq = 0; inode->i_rdev = 0; inode->dirtied_when = 0; #ifdef CONFIG_CGROUP_WRITEBACK inode->i_wb_frn_winner = 0; inode->i_wb_frn_avg_time = 0; inode->i_wb_frn_history = 0; #endif spin_lock_init(&inode->i_lock); lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); init_rwsem(&inode->i_rwsem); lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key); atomic_set(&inode->i_dio_count, 0); mapping->a_ops = &empty_aops; mapping->host = inode; mapping->flags = 0; mapping->wb_err = 0; atomic_set(&mapping->i_mmap_writable, 0); #ifdef CONFIG_READ_ONLY_THP_FOR_FS atomic_set(&mapping->nr_thps, 0); #endif mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->i_private_data = NULL; mapping->writeback_index = 0; init_rwsem(&mapping->invalidate_lock); lockdep_set_class_and_name(&mapping->invalidate_lock, &sb->s_type->invalidate_lock_key, "mapping.invalidate_lock"); if (sb->s_iflags & SB_I_STABLE_WRITES) mapping_set_stable_writes(mapping); inode->i_private = NULL; inode->i_mapping = mapping; INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ #ifdef CONFIG_FS_POSIX_ACL inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; #endif #ifdef CONFIG_FSNOTIFY inode->i_fsnotify_mask = 0; #endif inode->i_flctx = NULL; if (unlikely(security_inode_alloc(inode, gfp))) return -ENOMEM; this_cpu_inc(nr_inodes); return 0; } EXPORT_SYMBOL(inode_init_always_gfp); void free_inode_nonrcu(struct inode *inode) { kmem_cache_free(inode_cachep, inode); } EXPORT_SYMBOL(free_inode_nonrcu); static void i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); if (inode->free_inode) inode->free_inode(inode); else free_inode_nonrcu(inode); } /** * alloc_inode - obtain an inode * @sb: superblock * * Allocates a new inode for given superblock. * Inode wont be chained in superblock s_inodes list * This means : * - fs can't be unmount * - quotas, fsnotify, writeback can't work */ struct inode *alloc_inode(struct super_block *sb) { const struct super_operations *ops = sb->s_op; struct inode *inode; if (ops->alloc_inode) inode = ops->alloc_inode(sb); else inode = alloc_inode_sb(sb, inode_cachep, GFP_KERNEL); if (!inode) return NULL; if (unlikely(inode_init_always(sb, inode))) { if (ops->destroy_inode) { ops->destroy_inode(inode); if (!ops->free_inode) return NULL; } inode->free_inode = ops->free_inode; i_callback(&inode->i_rcu); return NULL; } return inode; } void __destroy_inode(struct inode *inode) { BUG_ON(inode_has_buffers(inode)); inode_detach_wb(inode); security_inode_free(inode); fsnotify_inode_delete(inode); locks_free_lock_context(inode); if (!inode->i_nlink) { WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0); atomic_long_dec(&inode->i_sb->s_remove_count); } #ifdef CONFIG_FS_POSIX_ACL if (inode->i_acl && !is_uncached_acl(inode->i_acl)) posix_acl_release(inode->i_acl); if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl)) posix_acl_release(inode->i_default_acl); #endif this_cpu_dec(nr_inodes); } EXPORT_SYMBOL(__destroy_inode); static void destroy_inode(struct inode *inode) { const struct super_operations *ops = inode->i_sb->s_op; BUG_ON(!list_empty(&inode->i_lru)); __destroy_inode(inode); if (ops->destroy_inode) { ops->destroy_inode(inode); if (!ops->free_inode) return; } inode->free_inode = ops->free_inode; call_rcu(&inode->i_rcu, i_callback); } /** * drop_nlink - directly drop an inode's link count * @inode: inode * * This is a low-level filesystem helper to replace any * direct filesystem manipulation of i_nlink. In cases * where we are attempting to track writes to the * filesystem, a decrement to zero means an imminent * write when the file is truncated and actually unlinked * on the filesystem. */ void drop_nlink(struct inode *inode) { WARN_ON(inode->i_nlink == 0); inode->__i_nlink--; if (!inode->i_nlink) atomic_long_inc(&inode->i_sb->s_remove_count); } EXPORT_SYMBOL(drop_nlink); /** * clear_nlink - directly zero an inode's link count * @inode: inode * * This is a low-level filesystem helper to replace any * direct filesystem manipulation of i_nlink. See * drop_nlink() for why we care about i_nlink hitting zero. */ void clear_nlink(struct inode *inode) { if (inode->i_nlink) { inode->__i_nlink = 0; atomic_long_inc(&inode->i_sb->s_remove_count); } } EXPORT_SYMBOL(clear_nlink); /** * set_nlink - directly set an inode's link count * @inode: inode * @nlink: new nlink (should be non-zero) * * This is a low-level filesystem helper to replace any * direct filesystem manipulation of i_nlink. */ void set_nlink(struct inode *inode, unsigned int nlink) { if (!nlink) { clear_nlink(inode); } else { /* Yes, some filesystems do change nlink from zero to one */ if (inode->i_nlink == 0) atomic_long_dec(&inode->i_sb->s_remove_count); inode->__i_nlink = nlink; } } EXPORT_SYMBOL(set_nlink); /** * inc_nlink - directly increment an inode's link count * @inode: inode * * This is a low-level filesystem helper to replace any * direct filesystem manipulation of i_nlink. Currently, * it is only here for parity with dec_nlink(). */ void inc_nlink(struct inode *inode) { if (unlikely(inode->i_nlink == 0)) { WARN_ON(!(inode->i_state & I_LINKABLE)); atomic_long_dec(&inode->i_sb->s_remove_count); } inode->__i_nlink++; } EXPORT_SYMBOL(inc_nlink); static void __address_space_init_once(struct address_space *mapping) { xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT); init_rwsem(&mapping->i_mmap_rwsem); INIT_LIST_HEAD(&mapping->i_private_list); spin_lock_init(&mapping->i_private_lock); mapping->i_mmap = RB_ROOT_CACHED; } void address_space_init_once(struct address_space *mapping) { memset(mapping, 0, sizeof(*mapping)); __address_space_init_once(mapping); } EXPORT_SYMBOL(address_space_init_once); /* * These are initializations that only need to be done * once, because the fields are idempotent across use * of the inode, so let the slab aware of that. */ void inode_init_once(struct inode *inode) { memset(inode, 0, sizeof(*inode)); INIT_HLIST_NODE(&inode->i_hash); INIT_LIST_HEAD(&inode->i_devices); INIT_LIST_HEAD(&inode->i_io_list); INIT_LIST_HEAD(&inode->i_wb_list); INIT_LIST_HEAD(&inode->i_lru); INIT_LIST_HEAD(&inode->i_sb_list); __address_space_init_once(&inode->i_data); i_size_ordered_init(inode); } EXPORT_SYMBOL(inode_init_once); static void init_once(void *foo) { struct inode *inode = (struct inode *) foo; inode_init_once(inode); } /* * get additional reference to inode; caller must already hold one. */ void ihold(struct inode *inode) { WARN_ON(atomic_inc_return(&inode->i_count) < 2); } EXPORT_SYMBOL(ihold); static void __inode_add_lru(struct inode *inode, bool rotate) { if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) return; if (icount_read(inode)) return; if (!(inode->i_sb->s_flags & SB_ACTIVE)) return; if (!mapping_shrinkable(&inode->i_data)) return; if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_inc(nr_unused); else if (rotate) inode->i_state |= I_REFERENCED; } struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, struct inode *inode, u32 bit) { void *bit_address; bit_address = inode_state_wait_address(inode, bit); init_wait_var_entry(wqe, bit_address, 0); return __var_waitqueue(bit_address); } EXPORT_SYMBOL(inode_bit_waitqueue); /* * Add inode to LRU if needed (inode is unused and clean). * * Needs inode->i_lock held. */ void inode_add_lru(struct inode *inode) { __inode_add_lru(inode, false); } static void inode_lru_list_del(struct inode *inode) { if (list_lru_del_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_dec(nr_unused); } static void inode_pin_lru_isolating(struct inode *inode) { lockdep_assert_held(&inode->i_lock); WARN_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE)); inode->i_state |= I_LRU_ISOLATING; } static void inode_unpin_lru_isolating(struct inode *inode) { spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_LRU_ISOLATING)); inode->i_state &= ~I_LRU_ISOLATING; /* Called with inode->i_lock which ensures memory ordering. */ inode_wake_up_bit(inode, __I_LRU_ISOLATING); spin_unlock(&inode->i_lock); } static void inode_wait_for_lru_isolating(struct inode *inode) { struct wait_bit_queue_entry wqe; struct wait_queue_head *wq_head; lockdep_assert_held(&inode->i_lock); if (!(inode->i_state & I_LRU_ISOLATING)) return; wq_head = inode_bit_waitqueue(&wqe, inode, __I_LRU_ISOLATING); for (;;) { prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); /* * Checking I_LRU_ISOLATING with inode->i_lock guarantees * memory ordering. */ if (!(inode->i_state & I_LRU_ISOLATING)) break; spin_unlock(&inode->i_lock); schedule(); spin_lock(&inode->i_lock); } finish_wait(wq_head, &wqe.wq_entry); WARN_ON(inode->i_state & I_LRU_ISOLATING); } /** * inode_sb_list_add - add inode to the superblock list of inodes * @inode: inode to add */ void inode_sb_list_add(struct inode *inode) { struct super_block *sb = inode->i_sb; spin_lock(&sb->s_inode_list_lock); list_add(&inode->i_sb_list, &sb->s_inodes); spin_unlock(&sb->s_inode_list_lock); } EXPORT_SYMBOL_GPL(inode_sb_list_add); static inline void inode_sb_list_del(struct inode *inode) { struct super_block *sb = inode->i_sb; if (!list_empty(&inode->i_sb_list)) { spin_lock(&sb->s_inode_list_lock); list_del_init(&inode->i_sb_list); spin_unlock(&sb->s_inode_list_lock); } } static unsigned long hash(struct super_block *sb, unsigned long hashval) { unsigned long tmp; tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / L1_CACHE_BYTES; tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift); return tmp & i_hash_mask; } /** * __insert_inode_hash - hash an inode * @inode: unhashed inode * @hashval: unsigned long value used to locate this object in the * inode_hashtable. * * Add an inode to the inode hash for this superblock. */ void __insert_inode_hash(struct inode *inode, unsigned long hashval) { struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); spin_lock(&inode_hash_lock); spin_lock(&inode->i_lock); hlist_add_head_rcu(&inode->i_hash, b); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); } EXPORT_SYMBOL(__insert_inode_hash); /** * __remove_inode_hash - remove an inode from the hash * @inode: inode to unhash * * Remove an inode from the superblock. */ void __remove_inode_hash(struct inode *inode) { spin_lock(&inode_hash_lock); spin_lock(&inode->i_lock); hlist_del_init_rcu(&inode->i_hash); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); } EXPORT_SYMBOL(__remove_inode_hash); void dump_mapping(const struct address_space *mapping) { struct inode *host; const struct address_space_operations *a_ops; struct hlist_node *dentry_first; struct dentry *dentry_ptr; struct dentry dentry; char fname[64] = {}; unsigned long ino; /* * If mapping is an invalid pointer, we don't want to crash * accessing it, so probe everything depending on it carefully. */ if (get_kernel_nofault(host, &mapping->host) || get_kernel_nofault(a_ops, &mapping->a_ops)) { pr_warn("invalid mapping:%px\n", mapping); return; } if (!host) { pr_warn("aops:%ps\n", a_ops); return; } if (get_kernel_nofault(dentry_first, &host->i_dentry.first) || get_kernel_nofault(ino, &host->i_ino)) { pr_warn("aops:%ps invalid inode:%px\n", a_ops, host); return; } if (!dentry_first) { pr_warn("aops:%ps ino:%lx\n", a_ops, ino); return; } dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); if (get_kernel_nofault(dentry, dentry_ptr) || !dentry.d_parent || !dentry.d_name.name) { pr_warn("aops:%ps ino:%lx invalid dentry:%px\n", a_ops, ino, dentry_ptr); return; } if (strncpy_from_kernel_nofault(fname, dentry.d_name.name, 63) < 0) strscpy(fname, "<invalid>"); /* * Even if strncpy_from_kernel_nofault() succeeded, * the fname could be unreliable */ pr_warn("aops:%ps ino:%lx dentry name(?):\"%s\"\n", a_ops, ino, fname); } void clear_inode(struct inode *inode) { /* * We have to cycle the i_pages lock here because reclaim can be in the * process of removing the last page (in __filemap_remove_folio()) * and we must not free the mapping under it. */ xa_lock_irq(&inode->i_data.i_pages); BUG_ON(inode->i_data.nrpages); /* * Almost always, mapping_empty(&inode->i_data) here; but there are * two known and long-standing ways in which nodes may get left behind * (when deep radix-tree node allocation failed partway; or when THP * collapse_file() failed). Until those two known cases are cleaned up, * or a cleanup function is called here, do not BUG_ON(!mapping_empty), * nor even WARN_ON(!mapping_empty). */ xa_unlock_irq(&inode->i_data.i_pages); BUG_ON(!list_empty(&inode->i_data.i_private_list)); BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); BUG_ON(!list_empty(&inode->i_wb_list)); /* don't need i_lock here, no concurrent mods to i_state */ inode->i_state = I_FREEING | I_CLEAR; } EXPORT_SYMBOL(clear_inode); /* * Free the inode passed in, removing it from the lists it is still connected * to. We remove any pages still attached to the inode and wait for any IO that * is still in progress before finally destroying the inode. * * An inode must already be marked I_FREEING so that we avoid the inode being * moved back onto lists if we race with other code that manipulates the lists * (e.g. writeback_single_inode). The caller is responsible for setting this. * * An inode must already be removed from the LRU list before being evicted from * the cache. This should occur atomically with setting the I_FREEING state * flag, so no inodes here should ever be on the LRU when being evicted. */ static void evict(struct inode *inode) { const struct super_operations *op = inode->i_sb->s_op; BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(!list_empty(&inode->i_lru)); if (!list_empty(&inode->i_io_list)) inode_io_list_del(inode); inode_sb_list_del(inode); spin_lock(&inode->i_lock); inode_wait_for_lru_isolating(inode); /* * Wait for flusher thread to be done with the inode so that filesystem * does not start destroying it while writeback is still running. Since * the inode has I_FREEING set, flusher thread won't start new work on * the inode. We just have to wait for running writeback to finish. */ inode_wait_for_writeback(inode); spin_unlock(&inode->i_lock); if (op->evict_inode) { op->evict_inode(inode); } else { truncate_inode_pages_final(&inode->i_data); clear_inode(inode); } if (S_ISCHR(inode->i_mode) && inode->i_cdev) cd_forget(inode); remove_inode_hash(inode); /* * Wake up waiters in __wait_on_freeing_inode(). * * It is an invariant that any thread we need to wake up is already * accounted for before remove_inode_hash() acquires ->i_lock -- both * sides take the lock and sleep is aborted if the inode is found * unhashed. Thus either the sleeper wins and goes off CPU, or removal * wins and the sleeper aborts after testing with the lock. * * This also means we don't need any fences for the call below. */ inode_wake_up_bit(inode, __I_NEW); BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); destroy_inode(inode); } /* * dispose_list - dispose of the contents of a local list * @head: the head of the list to free * * Dispose-list gets a local list with local inodes in it, so it doesn't * need to worry about list corruption and SMP locks. */ static void dispose_list(struct list_head *head) { while (!list_empty(head)) { struct inode *inode; inode = list_first_entry(head, struct inode, i_lru); list_del_init(&inode->i_lru); evict(inode); cond_resched(); } } /** * evict_inodes - evict all evictable inodes for a superblock * @sb: superblock to operate on * * Make sure that no inodes with zero refcount are retained. This is * called by superblock shutdown after having SB_ACTIVE flag removed, * so any inode reaching zero refcount during or after that call will * be immediately evicted. */ void evict_inodes(struct super_block *sb) { struct inode *inode; LIST_HEAD(dispose); again: spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { if (icount_read(inode)) continue; spin_lock(&inode->i_lock); if (icount_read(inode)) { spin_unlock(&inode->i_lock); continue; } if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { spin_unlock(&inode->i_lock); continue; } inode->i_state |= I_FREEING; inode_lru_list_del(inode); spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); /* * We can have a ton of inodes to evict at unmount time given * enough memory, check to see if we need to go to sleep for a * bit so we don't livelock. */ if (need_resched()) { spin_unlock(&sb->s_inode_list_lock); cond_resched(); dispose_list(&dispose); goto again; } } spin_unlock(&sb->s_inode_list_lock); dispose_list(&dispose); } EXPORT_SYMBOL_GPL(evict_inodes); /* * Isolate the inode from the LRU in preparation for freeing it. * * If the inode has the I_REFERENCED flag set, then it means that it has been * used recently - the flag is set in iput_final(). When we encounter such an * inode, clear the flag and move it to the back of the LRU so it gets another * pass through the LRU before it gets reclaimed. This is necessary because of * the fact we are doing lazy LRU updates to minimise lock contention so the * LRU does not have strict ordering. Hence we don't want to reclaim inodes * with this flag set because they are the inodes that are out of order. */ static enum lru_status inode_lru_isolate(struct list_head *item, struct list_lru_one *lru, void *arg) { struct list_head *freeable = arg; struct inode *inode = container_of(item, struct inode, i_lru); /* * We are inverting the lru lock/inode->i_lock here, so use a * trylock. If we fail to get the lock, just skip it. */ if (!spin_trylock(&inode->i_lock)) return LRU_SKIP; /* * Inodes can get referenced, redirtied, or repopulated while * they're already on the LRU, and this can make them * unreclaimable for a while. Remove them lazily here; iput, * sync, or the last page cache deletion will requeue them. */ if (icount_read(inode) || (inode->i_state & ~I_REFERENCED) || !mapping_shrinkable(&inode->i_data)) { list_lru_isolate(lru, &inode->i_lru); spin_unlock(&inode->i_lock); this_cpu_dec(nr_unused); return LRU_REMOVED; } /* Recently referenced inodes get one more pass */ if (inode->i_state & I_REFERENCED) { inode->i_state &= ~I_REFERENCED; spin_unlock(&inode->i_lock); return LRU_ROTATE; } /* * On highmem systems, mapping_shrinkable() permits dropping * page cache in order to free up struct inodes: lowmem might * be under pressure before the cache inside the highmem zone. */ if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) { inode_pin_lru_isolating(inode); spin_unlock(&inode->i_lock); spin_unlock(&lru->lock); if (remove_inode_buffers(inode)) { unsigned long reap; reap = invalidate_mapping_pages(&inode->i_data, 0, -1); if (current_is_kswapd()) __count_vm_events(KSWAPD_INODESTEAL, reap); else __count_vm_events(PGINODESTEAL, reap); mm_account_reclaimed_pages(reap); } inode_unpin_lru_isolating(inode); return LRU_RETRY; } WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; list_lru_isolate_move(lru, &inode->i_lru, freeable); spin_unlock(&inode->i_lock); this_cpu_dec(nr_unused); return LRU_REMOVED; } /* * Walk the superblock inode LRU for freeable inodes and attempt to free them. * This is called from the superblock shrinker function with a number of inodes * to trim from the LRU. Inodes to be freed are moved to a temporary list and * then are freed outside inode_lock by dispose_list(). */ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) { LIST_HEAD(freeable); long freed; freed = list_lru_shrink_walk(&sb->s_inode_lru, sc, inode_lru_isolate, &freeable); dispose_list(&freeable); return freed; } static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked); /* * Called with the inode lock held. */ static struct inode *find_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data, bool is_inode_hash_locked) { struct inode *inode = NULL; if (is_inode_hash_locked) lockdep_assert_held(&inode_hash_lock); else lockdep_assert_not_held(&inode_hash_lock); rcu_read_lock(); repeat: hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_sb != sb) continue; if (!test(inode, data)) continue; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode, is_inode_hash_locked); goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { spin_unlock(&inode->i_lock); rcu_read_unlock(); return ERR_PTR(-ESTALE); } __iget(inode); spin_unlock(&inode->i_lock); rcu_read_unlock(); return inode; } rcu_read_unlock(); return NULL; } /* * find_inode_fast is the fast path version of find_inode, see the comment at * iget_locked for details. */ static struct inode *find_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino, bool is_inode_hash_locked) { struct inode *inode = NULL; if (is_inode_hash_locked) lockdep_assert_held(&inode_hash_lock); else lockdep_assert_not_held(&inode_hash_lock); rcu_read_lock(); repeat: hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_ino != ino) continue; if (inode->i_sb != sb) continue; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode, is_inode_hash_locked); goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { spin_unlock(&inode->i_lock); rcu_read_unlock(); return ERR_PTR(-ESTALE); } __iget(inode); spin_unlock(&inode->i_lock); rcu_read_unlock(); return inode; } rcu_read_unlock(); return NULL; } /* * Each cpu owns a range of LAST_INO_BATCH numbers. * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations, * to renew the exhausted range. * * This does not significantly increase overflow rate because every CPU can * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the * 2^32 range, and is a worst-case. Even a 50% wastage would only increase * overflow rate by 2x, which does not seem too significant. * * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW * error if st_ino won't fit in target struct field. Use 32bit counter * here to attempt to avoid that. */ #define LAST_INO_BATCH 1024 static DEFINE_PER_CPU(unsigned int, last_ino); unsigned int get_next_ino(void) { unsigned int *p = &get_cpu_var(last_ino); unsigned int res = *p; #ifdef CONFIG_SMP if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) { static atomic_t shared_last_ino; int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino); res = next - LAST_INO_BATCH; } #endif res++; /* get_next_ino should not provide a 0 inode number */ if (unlikely(!res)) res++; *p = res; put_cpu_var(last_ino); return res; } EXPORT_SYMBOL(get_next_ino); /** * new_inode - obtain an inode * @sb: superblock * * Allocates a new inode for given superblock. The default gfp_mask * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE. * If HIGHMEM pages are unsuitable or it is known that pages allocated * for the page cache are not reclaimable or migratable, * mapping_set_gfp_mask() must be called with suitable flags on the * newly created inode's mapping * */ struct inode *new_inode(struct super_block *sb) { struct inode *inode; inode = alloc_inode(sb); if (inode) inode_sb_list_add(inode); return inode; } EXPORT_SYMBOL(new_inode); #ifdef CONFIG_DEBUG_LOCK_ALLOC void lockdep_annotate_inode_mutex_key(struct inode *inode) { if (S_ISDIR(inode->i_mode)) { struct file_system_type *type = inode->i_sb->s_type; /* Set new key only if filesystem hasn't already changed it */ if (lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key)) { /* * ensure nobody is actually holding i_rwsem */ init_rwsem(&inode->i_rwsem); lockdep_set_class(&inode->i_rwsem, &type->i_mutex_dir_key); } } } EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key); #endif /** * unlock_new_inode - clear the I_NEW state and wake up any waiters * @inode: new inode to unlock * * Called when the inode is fully initialised to clear the new state of the * inode and wake up anyone waiting for the inode to finish initialisation. */ void unlock_new_inode(struct inode *inode) { lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW & ~I_CREATING; /* * Pairs with the barrier in prepare_to_wait_event() to make sure * ___wait_var_event() either sees the bit cleared or * waitqueue_active() check in wake_up_var() sees the waiter. */ smp_mb(); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(unlock_new_inode); void discard_new_inode(struct inode *inode) { lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW; /* * Pairs with the barrier in prepare_to_wait_event() to make sure * ___wait_var_event() either sees the bit cleared or * waitqueue_active() check in wake_up_var() sees the waiter. */ smp_mb(); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); iput(inode); } EXPORT_SYMBOL(discard_new_inode); /** * lock_two_nondirectories - take two i_mutexes on non-directory objects * * Lock any non-NULL argument. Passed objects must not be directories. * Zero, one or two objects may be locked by this function. * * @inode1: first inode to lock * @inode2: second inode to lock */ void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) { if (inode1) WARN_ON_ONCE(S_ISDIR(inode1->i_mode)); if (inode2) WARN_ON_ONCE(S_ISDIR(inode2->i_mode)); if (inode1 > inode2) swap(inode1, inode2); if (inode1) inode_lock(inode1); if (inode2 && inode2 != inode1) inode_lock_nested(inode2, I_MUTEX_NONDIR2); } EXPORT_SYMBOL(lock_two_nondirectories); /** * unlock_two_nondirectories - release locks from lock_two_nondirectories() * @inode1: first inode to unlock * @inode2: second inode to unlock */ void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2) { if (inode1) { WARN_ON_ONCE(S_ISDIR(inode1->i_mode)); inode_unlock(inode1); } if (inode2 && inode2 != inode1) { WARN_ON_ONCE(S_ISDIR(inode2->i_mode)); inode_unlock(inode2); } } EXPORT_SYMBOL(unlock_two_nondirectories); /** * inode_insert5 - obtain an inode from a mounted file system * @inode: pre-allocated inode to use for insert to cache * @hashval: hash value (usually inode number) to get * @test: callback used for comparisons between inodes * @set: callback used to initialize a new struct inode * @data: opaque data pointer to pass to @test and @set * * Search for the inode specified by @hashval and @data in the inode cache, * and if present return it with an increased reference count. This is a * variant of iget5_locked() that doesn't allocate an inode. * * If the inode is not present in the cache, insert the pre-allocated inode and * return it locked, hashed, and with the I_NEW flag set. The file system gets * to fill it in before unlocking it via unlock_new_inode(). * * Note that both @test and @set are called with the inode_hash_lock held, so * they can't sleep. */ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); struct inode *old; might_sleep(); again: spin_lock(&inode_hash_lock); old = find_inode(inode->i_sb, head, test, data, true); if (unlikely(old)) { /* * Uhhuh, somebody else created the same inode under us. * Use the old inode instead of the preallocated one. */ spin_unlock(&inode_hash_lock); if (IS_ERR(old)) return NULL; wait_on_inode(old); if (unlikely(inode_unhashed(old))) { iput(old); goto again; } return old; } if (set && unlikely(set(inode, data))) { spin_unlock(&inode_hash_lock); return NULL; } /* * Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents */ spin_lock(&inode->i_lock); inode->i_state |= I_NEW; hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); /* * Add inode to the sb list if it's not already. It has I_NEW at this * point, so it should be safe to test i_sb_list locklessly. */ if (list_empty(&inode->i_sb_list)) inode_sb_list_add(inode); return inode; } EXPORT_SYMBOL(inode_insert5); /** * iget5_locked - obtain an inode from a mounted file system * @sb: super block of file system * @hashval: hash value (usually inode number) to get * @test: callback used for comparisons between inodes * @set: callback used to initialize a new struct inode * @data: opaque data pointer to pass to @test and @set * * Search for the inode specified by @hashval and @data in the inode cache, * and if present return it with an increased reference count. This is a * generalized version of iget_locked() for file systems where the inode * number is not sufficient for unique identification of an inode. * * If the inode is not present in the cache, allocate and insert a new inode * and return it locked, hashed, and with the I_NEW flag set. The file system * gets to fill it in before unlocking it via unlock_new_inode(). * * Note that both @test and @set are called with the inode_hash_lock held, so * they can't sleep. */ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { struct inode *inode = ilookup5(sb, hashval, test, data); if (!inode) { struct inode *new = alloc_inode(sb); if (new) { inode = inode_insert5(new, hashval, test, set, data); if (unlikely(inode != new)) destroy_inode(new); } } return inode; } EXPORT_SYMBOL(iget5_locked); /** * iget5_locked_rcu - obtain an inode from a mounted file system * @sb: super block of file system * @hashval: hash value (usually inode number) to get * @test: callback used for comparisons between inodes * @set: callback used to initialize a new struct inode * @data: opaque data pointer to pass to @test and @set * * This is equivalent to iget5_locked, except the @test callback must * tolerate the inode not being stable, including being mid-teardown. */ struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode, *new; might_sleep(); again: inode = find_inode(sb, head, test, data, false); if (inode) { if (IS_ERR(inode)) return NULL; wait_on_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; } return inode; } new = alloc_inode(sb); if (new) { inode = inode_insert5(new, hashval, test, set, data); if (unlikely(inode != new)) destroy_inode(new); } return inode; } EXPORT_SYMBOL_GPL(iget5_locked_rcu); /** * iget_locked - obtain an inode from a mounted file system * @sb: super block of file system * @ino: inode number to get * * Search for the inode specified by @ino in the inode cache and if present * return it with an increased reference count. This is for file systems * where the inode number is sufficient for unique identification of an inode. * * If the inode is not in cache, allocate a new inode and return it locked, * hashed, and with the I_NEW flag set. The file system gets to fill it in * before unlocking it via unlock_new_inode(). */ struct inode *iget_locked(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; might_sleep(); again: inode = find_inode_fast(sb, head, ino, false); if (inode) { if (IS_ERR(inode)) return NULL; wait_on_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; } return inode; } inode = alloc_inode(sb); if (inode) { struct inode *old; spin_lock(&inode_hash_lock); /* We released the lock, so.. */ old = find_inode_fast(sb, head, ino, true); if (!old) { inode->i_ino = ino; spin_lock(&inode->i_lock); inode->i_state = I_NEW; hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); inode_sb_list_add(inode); /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents */ return inode; } /* * Uhhuh, somebody else created the same inode under * us. Use the old inode instead of the one we just * allocated. */ spin_unlock(&inode_hash_lock); destroy_inode(inode); if (IS_ERR(old)) return NULL; inode = old; wait_on_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; } } return inode; } EXPORT_SYMBOL(iget_locked); /* * search the inode cache for a matching inode number. * If we find one, then the inode number we are trying to * allocate is not unique and so we should not use it. * * Returns 1 if the inode number is unique, 0 if it is not. */ static int test_inode_iunique(struct super_block *sb, unsigned long ino) { struct hlist_head *b = inode_hashtable + hash(sb, ino); struct inode *inode; hlist_for_each_entry_rcu(inode, b, i_hash) { if (inode->i_ino == ino && inode->i_sb == sb) return 0; } return 1; } /** * iunique - get a unique inode number * @sb: superblock * @max_reserved: highest reserved inode number * * Obtain an inode number that is unique on the system for a given * superblock. This is used by file systems that have no natural * permanent inode numbering system. An inode number is returned that * is higher than the reserved limit but unique. * * BUGS: * With a large number of inodes live on the file system this function * currently becomes quite slow. */ ino_t iunique(struct super_block *sb, ino_t max_reserved) { /* * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW * error if st_ino won't fit in target struct field. Use 32bit counter * here to attempt to avoid that. */ static DEFINE_SPINLOCK(iunique_lock); static unsigned int counter; ino_t res; rcu_read_lock(); spin_lock(&iunique_lock); do { if (counter <= max_reserved) counter = max_reserved + 1; res = counter++; } while (!test_inode_iunique(sb, res)); spin_unlock(&iunique_lock); rcu_read_unlock(); return res; } EXPORT_SYMBOL(iunique); struct inode *igrab(struct inode *inode) { spin_lock(&inode->i_lock); if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) { __iget(inode); spin_unlock(&inode->i_lock); } else { spin_unlock(&inode->i_lock); /* * Handle the case where s_op->clear_inode is not been * called yet, and somebody is calling igrab * while the inode is getting freed. */ inode = NULL; } return inode; } EXPORT_SYMBOL(igrab); /** * ilookup5_nowait - search for an inode in the inode cache * @sb: super block of file system to search * @hashval: hash value (usually inode number) to search for * @test: callback used for comparisons between inodes * @data: opaque data pointer to pass to @test * * Search for the inode specified by @hashval and @data in the inode cache. * If the inode is in the cache, the inode is returned with an incremented * reference count. * * Note: I_NEW is not waited upon so you have to be very careful what you do * with the returned inode. You probably should be using ilookup5() instead. * * Note2: @test is called with the inode_hash_lock held, so can't sleep. */ struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode; spin_lock(&inode_hash_lock); inode = find_inode(sb, head, test, data, true); spin_unlock(&inode_hash_lock); return IS_ERR(inode) ? NULL : inode; } EXPORT_SYMBOL(ilookup5_nowait); /** * ilookup5 - search for an inode in the inode cache * @sb: super block of file system to search * @hashval: hash value (usually inode number) to search for * @test: callback used for comparisons between inodes * @data: opaque data pointer to pass to @test * * Search for the inode specified by @hashval and @data in the inode cache, * and if the inode is in the cache, return the inode with an incremented * reference count. Waits on I_NEW before returning the inode. * returned with an incremented reference count. * * This is a generalized version of ilookup() for file systems where the * inode number is not sufficient for unique identification of an inode. * * Note: @test is called with the inode_hash_lock held, so can't sleep. */ struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { struct inode *inode; might_sleep(); again: inode = ilookup5_nowait(sb, hashval, test, data); if (inode) { wait_on_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; } } return inode; } EXPORT_SYMBOL(ilookup5); /** * ilookup - search for an inode in the inode cache * @sb: super block of file system to search * @ino: inode number to search for * * Search for the inode @ino in the inode cache, and if the inode is in the * cache, the inode is returned with an incremented reference count. */ struct inode *ilookup(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; might_sleep(); again: inode = find_inode_fast(sb, head, ino, false); if (inode) { if (IS_ERR(inode)) return NULL; wait_on_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; } } return inode; } EXPORT_SYMBOL(ilookup); /** * find_inode_nowait - find an inode in the inode cache * @sb: super block of file system to search * @hashval: hash value (usually inode number) to search for * @match: callback used for comparisons between inodes * @data: opaque data pointer to pass to @match * * Search for the inode specified by @hashval and @data in the inode * cache, where the helper function @match will return 0 if the inode * does not match, 1 if the inode does match, and -1 if the search * should be stopped. The @match function must be responsible for * taking the i_lock spin_lock and checking i_state for an inode being * freed or being initialized, and incrementing the reference count * before returning 1. It also must not sleep, since it is called with * the inode_hash_lock spinlock held. * * This is a even more generalized version of ilookup5() when the * function must never block --- find_inode() can block in * __wait_on_freeing_inode() --- or when the caller can not increment * the reference count because the resulting iput() might cause an * inode eviction. The tradeoff is that the @match funtion must be * very carefully implemented. */ struct inode *find_inode_nowait(struct super_block *sb, unsigned long hashval, int (*match)(struct inode *, unsigned long, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode, *ret_inode = NULL; int mval; spin_lock(&inode_hash_lock); hlist_for_each_entry(inode, head, i_hash) { if (inode->i_sb != sb) continue; mval = match(inode, hashval, data); if (mval == 0) continue; if (mval == 1) ret_inode = inode; goto out; } out: spin_unlock(&inode_hash_lock); return ret_inode; } EXPORT_SYMBOL(find_inode_nowait); /** * find_inode_rcu - find an inode in the inode cache * @sb: Super block of file system to search * @hashval: Key to hash * @test: Function to test match on an inode * @data: Data for test function * * Search for the inode specified by @hashval and @data in the inode cache, * where the helper function @test will return 0 if the inode does not match * and 1 if it does. The @test function must be responsible for taking the * i_lock spin_lock and checking i_state for an inode being freed or being * initialized. * * If successful, this will return the inode for which the @test function * returned 1 and NULL otherwise. * * The @test function is not permitted to take a ref on any inode presented. * It is also not permitted to sleep. * * The caller must hold the RCU read lock. */ struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious find_inode_rcu() usage"); hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_sb == sb && !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) && test(inode, data)) return inode; } return NULL; } EXPORT_SYMBOL(find_inode_rcu); /** * find_inode_by_ino_rcu - Find an inode in the inode cache * @sb: Super block of file system to search * @ino: The inode number to match * * Search for the inode specified by @hashval and @data in the inode cache, * where the helper function @test will return 0 if the inode does not match * and 1 if it does. The @test function must be responsible for taking the * i_lock spin_lock and checking i_state for an inode being freed or being * initialized. * * If successful, this will return the inode for which the @test function * returned 1 and NULL otherwise. * * The @test function is not permitted to take a ref on any inode presented. * It is also not permitted to sleep. * * The caller must hold the RCU read lock. */ struct inode *find_inode_by_ino_rcu(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious find_inode_by_ino_rcu() usage"); hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_ino == ino && inode->i_sb == sb && !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE))) return inode; } return NULL; } EXPORT_SYMBOL(find_inode_by_ino_rcu); int insert_inode_locked(struct inode *inode) { struct super_block *sb = inode->i_sb; ino_t ino = inode->i_ino; struct hlist_head *head = inode_hashtable + hash(sb, ino); might_sleep(); while (1) { struct inode *old = NULL; spin_lock(&inode_hash_lock); hlist_for_each_entry(old, head, i_hash) { if (old->i_ino != ino) continue; if (old->i_sb != sb) continue; spin_lock(&old->i_lock); if (old->i_state & (I_FREEING|I_WILL_FREE)) { spin_unlock(&old->i_lock); continue; } break; } if (likely(!old)) { spin_lock(&inode->i_lock); inode->i_state |= I_NEW | I_CREATING; hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); return 0; } if (unlikely(old->i_state & I_CREATING)) { spin_unlock(&old->i_lock); spin_unlock(&inode_hash_lock); return -EBUSY; } __iget(old); spin_unlock(&old->i_lock); spin_unlock(&inode_hash_lock); wait_on_inode(old); if (unlikely(!inode_unhashed(old))) { iput(old); return -EBUSY; } iput(old); } } EXPORT_SYMBOL(insert_inode_locked); int insert_inode_locked4(struct inode *inode, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { struct inode *old; might_sleep(); inode->i_state |= I_CREATING; old = inode_insert5(inode, hashval, test, NULL, data); if (old != inode) { iput(old); return -EBUSY; } return 0; } EXPORT_SYMBOL(insert_inode_locked4); int inode_just_drop(struct inode *inode) { return 1; } EXPORT_SYMBOL(inode_just_drop); /* * Called when we're dropping the last reference * to an inode. * * Call the FS "drop_inode()" function, defaulting to * the legacy UNIX filesystem behaviour. If it tells * us to evict inode, do so. Otherwise, retain inode * in cache if fs is alive, sync and evict if fs is * shutting down. */ static void iput_final(struct inode *inode) { struct super_block *sb = inode->i_sb; const struct super_operations *op = inode->i_sb->s_op; unsigned long state; int drop; WARN_ON(inode->i_state & I_NEW); if (op->drop_inode) drop = op->drop_inode(inode); else drop = inode_generic_drop(inode); if (!drop && !(inode->i_state & I_DONTCACHE) && (sb->s_flags & SB_ACTIVE)) { __inode_add_lru(inode, true); spin_unlock(&inode->i_lock); return; } state = inode->i_state; if (!drop) { WRITE_ONCE(inode->i_state, state | I_WILL_FREE); spin_unlock(&inode->i_lock); write_inode_now(inode, 1); spin_lock(&inode->i_lock); state = inode->i_state; WARN_ON(state & I_NEW); state &= ~I_WILL_FREE; } WRITE_ONCE(inode->i_state, state | I_FREEING); if (!list_empty(&inode->i_lru)) inode_lru_list_del(inode); spin_unlock(&inode->i_lock); evict(inode); } /** * iput - put an inode * @inode: inode to put * * Puts an inode, dropping its usage count. If the inode use count hits * zero, the inode is then freed and may also be destroyed. * * Consequently, iput() can sleep. */ void iput(struct inode *inode) { might_sleep(); if (unlikely(!inode)) return; retry: lockdep_assert_not_held(&inode->i_lock); VFS_BUG_ON_INODE(inode->i_state & I_CLEAR, inode); /* * Note this assert is technically racy as if the count is bogusly * equal to one, then two CPUs racing to further drop it can both * conclude it's fine. */ VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 1, inode); if (atomic_add_unless(&inode->i_count, -1, 1)) return; if ((inode->i_state & I_DIRTY_TIME) && inode->i_nlink) { trace_writeback_lazytime_iput(inode); mark_inode_dirty_sync(inode); goto retry; } spin_lock(&inode->i_lock); if (unlikely((inode->i_state & I_DIRTY_TIME) && inode->i_nlink)) { spin_unlock(&inode->i_lock); goto retry; } if (!atomic_dec_and_test(&inode->i_count)) { spin_unlock(&inode->i_lock); return; } /* * iput_final() drops ->i_lock, we can't assert on it as the inode may * be deallocated by the time the call returns. */ iput_final(inode); } EXPORT_SYMBOL(iput); #ifdef CONFIG_BLOCK /** * bmap - find a block number in a file * @inode: inode owning the block number being requested * @block: pointer containing the block to find * * Replaces the value in ``*block`` with the block number on the device holding * corresponding to the requested block number in the file. * That is, asked for block 4 of inode 1 the function will replace the * 4 in ``*block``, with disk block relative to the disk start that holds that * block of the file. * * Returns -EINVAL in case of error, 0 otherwise. If mapping falls into a * hole, returns 0 and ``*block`` is also set to 0. */ int bmap(struct inode *inode, sector_t *block) { if (!inode->i_mapping->a_ops->bmap) return -EINVAL; *block = inode->i_mapping->a_ops->bmap(inode->i_mapping, *block); return 0; } EXPORT_SYMBOL(bmap); #endif /* * With relative atime, only update atime if the previous atime is * earlier than or equal to either the ctime or mtime, * or if at least a day has passed since the last atime update. */ static bool relatime_need_update(struct vfsmount *mnt, struct inode *inode, struct timespec64 now) { struct timespec64 atime, mtime, ctime; if (!(mnt->mnt_flags & MNT_RELATIME)) return true; /* * Is mtime younger than or equal to atime? If yes, update atime: */ atime = inode_get_atime(inode); mtime = inode_get_mtime(inode); if (timespec64_compare(&mtime, &atime) >= 0) return true; /* * Is ctime younger than or equal to atime? If yes, update atime: */ ctime = inode_get_ctime(inode); if (timespec64_compare(&ctime, &atime) >= 0) return true; /* * Is the previous atime value older than a day? If yes, * update atime: */ if ((long)(now.tv_sec - atime.tv_sec) >= 24*60*60) return true; /* * Good, we can skip the atime update: */ return false; } /** * inode_update_timestamps - update the timestamps on the inode * @inode: inode to be updated * @flags: S_* flags that needed to be updated * * The update_time function is called when an inode's timestamps need to be * updated for a read or write operation. This function handles updating the * actual timestamps. It's up to the caller to ensure that the inode is marked * dirty appropriately. * * In the case where any of S_MTIME, S_CTIME, or S_VERSION need to be updated, * attempt to update all three of them. S_ATIME updates can be handled * independently of the rest. * * Returns a set of S_* flags indicating which values changed. */ int inode_update_timestamps(struct inode *inode, int flags) { int updated = 0; struct timespec64 now; if (flags & (S_MTIME|S_CTIME|S_VERSION)) { struct timespec64 ctime = inode_get_ctime(inode); struct timespec64 mtime = inode_get_mtime(inode); now = inode_set_ctime_current(inode); if (!timespec64_equal(&now, &ctime)) updated |= S_CTIME; if (!timespec64_equal(&now, &mtime)) { inode_set_mtime_to_ts(inode, now); updated |= S_MTIME; } if (IS_I_VERSION(inode) && inode_maybe_inc_iversion(inode, updated)) updated |= S_VERSION; } else { now = current_time(inode); } if (flags & S_ATIME) { struct timespec64 atime = inode_get_atime(inode); if (!timespec64_equal(&now, &atime)) { inode_set_atime_to_ts(inode, now); updated |= S_ATIME; } } return updated; } EXPORT_SYMBOL(inode_update_timestamps); /** * generic_update_time - update the timestamps on the inode * @inode: inode to be updated * @flags: S_* flags that needed to be updated * * The update_time function is called when an inode's timestamps need to be * updated for a read or write operation. In the case where any of S_MTIME, S_CTIME, * or S_VERSION need to be updated we attempt to update all three of them. S_ATIME * updates can be handled done independently of the rest. * * Returns a S_* mask indicating which fields were updated. */ int generic_update_time(struct inode *inode, int flags) { int updated = inode_update_timestamps(inode, flags); int dirty_flags = 0; if (updated & (S_ATIME|S_MTIME|S_CTIME)) dirty_flags = inode->i_sb->s_flags & SB_LAZYTIME ? I_DIRTY_TIME : I_DIRTY_SYNC; if (updated & S_VERSION) dirty_flags |= I_DIRTY_SYNC; __mark_inode_dirty(inode, dirty_flags); return updated; } EXPORT_SYMBOL(generic_update_time); /* * This does the actual work of updating an inodes time or version. Must have * had called mnt_want_write() before calling this. */ int inode_update_time(struct inode *inode, int flags) { if (inode->i_op->update_time) return inode->i_op->update_time(inode, flags); generic_update_time(inode, flags); return 0; } EXPORT_SYMBOL(inode_update_time); /** * atime_needs_update - update the access time * @path: the &struct path to update * @inode: inode to update * * Update the accessed time on an inode and mark it for writeback. * This function automatically handles read only file systems and media, * as well as the "noatime" flag and inode specific "noatime" markers. */ bool atime_needs_update(const struct path *path, struct inode *inode) { struct vfsmount *mnt = path->mnt; struct timespec64 now, atime; if (inode->i_flags & S_NOATIME) return false; /* Atime updates will likely cause i_uid and i_gid to be written * back improprely if their true value is unknown to the vfs. */ if (HAS_UNMAPPED_ID(mnt_idmap(mnt), inode)) return false; if (IS_NOATIME(inode)) return false; if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)) return false; if (mnt->mnt_flags & MNT_NOATIME) return false; if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) return false; now = current_time(inode); if (!relatime_need_update(mnt, inode, now)) return false; atime = inode_get_atime(inode); if (timespec64_equal(&atime, &now)) return false; return true; } void touch_atime(const struct path *path) { struct vfsmount *mnt = path->mnt; struct inode *inode = d_inode(path->dentry); if (!atime_needs_update(path, inode)) return; if (!sb_start_write_trylock(inode->i_sb)) return; if (mnt_get_write_access(mnt) != 0) goto skip_update; /* * File systems can error out when updating inodes if they need to * allocate new space to modify an inode (such is the case for * Btrfs), but since we touch atime while walking down the path we * really don't care if we failed to update the atime of the file, * so just ignore the return value. * We may also fail on filesystems that have the ability to make parts * of the fs read only, e.g. subvolumes in Btrfs. */ inode_update_time(inode, S_ATIME); mnt_put_write_access(mnt); skip_update: sb_end_write(inode->i_sb); } EXPORT_SYMBOL(touch_atime); /* * Return mask of changes for notify_change() that need to be done as a * response to write or truncate. Return 0 if nothing has to be changed. * Negative value on error (change should be denied). */ int dentry_needs_remove_privs(struct mnt_idmap *idmap, struct dentry *dentry) { struct inode *inode = d_inode(dentry); int mask = 0; int ret; if (IS_NOSEC(inode)) return 0; mask = setattr_should_drop_suidgid(idmap, inode); ret = security_inode_need_killpriv(dentry); if (ret < 0) return ret; if (ret) mask |= ATTR_KILL_PRIV; return mask; } static int __remove_privs(struct mnt_idmap *idmap, struct dentry *dentry, int kill) { struct iattr newattrs; newattrs.ia_valid = ATTR_FORCE | kill; /* * Note we call this on write, so notify_change will not * encounter any conflicting delegations: */ return notify_change(idmap, dentry, &newattrs, NULL); } static int file_remove_privs_flags(struct file *file, unsigned int flags) { struct dentry *dentry = file_dentry(file); struct inode *inode = file_inode(file); int error = 0; int kill; if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode)) return 0; kill = dentry_needs_remove_privs(file_mnt_idmap(file), dentry); if (kill < 0) return kill; if (kill) { if (flags & IOCB_NOWAIT) return -EAGAIN; error = __remove_privs(file_mnt_idmap(file), dentry, kill); } if (!error) inode_has_no_xattr(inode); return error; } /** * file_remove_privs - remove special file privileges (suid, capabilities) * @file: file to remove privileges from * * When file is modified by a write or truncation ensure that special * file privileges are removed. * * Return: 0 on success, negative errno on failure. */ int file_remove_privs(struct file *file) { return file_remove_privs_flags(file, 0); } EXPORT_SYMBOL(file_remove_privs); /** * current_time - Return FS time (possibly fine-grained) * @inode: inode. * * Return the current time truncated to the time granularity supported by * the fs, as suitable for a ctime/mtime change. If the ctime is flagged * as having been QUERIED, get a fine-grained timestamp, but don't update * the floor. * * For a multigrain inode, this is effectively an estimate of the timestamp * that a file would receive. An actual update must go through * inode_set_ctime_current(). */ struct timespec64 current_time(struct inode *inode) { struct timespec64 now; u32 cns; ktime_get_coarse_real_ts64_mg(&now); if (!is_mgtime(inode)) goto out; /* If nothing has queried it, then coarse time is fine */ cns = smp_load_acquire(&inode->i_ctime_nsec); if (cns & I_CTIME_QUERIED) { /* * If there is no apparent change, then get a fine-grained * timestamp. */ if (now.tv_nsec == (cns & ~I_CTIME_QUERIED)) ktime_get_real_ts64(&now); } out: return timestamp_truncate(now, inode); } EXPORT_SYMBOL(current_time); static int inode_needs_update_time(struct inode *inode) { struct timespec64 now, ts; int sync_it = 0; /* First try to exhaust all avenues to not sync */ if (IS_NOCMTIME(inode)) return 0; now = current_time(inode); ts = inode_get_mtime(inode); if (!timespec64_equal(&ts, &now)) sync_it |= S_MTIME; ts = inode_get_ctime(inode); if (!timespec64_equal(&ts, &now)) sync_it |= S_CTIME; if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode)) sync_it |= S_VERSION; return sync_it; } static int __file_update_time(struct file *file, int sync_mode) { int ret = 0; struct inode *inode = file_inode(file); /* try to update time settings */ if (!mnt_get_write_access_file(file)) { ret = inode_update_time(inode, sync_mode); mnt_put_write_access_file(file); } return ret; } /** * file_update_time - update mtime and ctime time * @file: file accessed * * Update the mtime and ctime members of an inode and mark the inode for * writeback. Note that this function is meant exclusively for usage in * the file write path of filesystems, and filesystems may choose to * explicitly ignore updates via this function with the _NOCMTIME inode * flag, e.g. for network filesystem where these imestamps are handled * by the server. This can return an error for file systems who need to * allocate space in order to update an inode. * * Return: 0 on success, negative errno on failure. */ int file_update_time(struct file *file) { int ret; struct inode *inode = file_inode(file); ret = inode_needs_update_time(inode); if (ret <= 0) return ret; return __file_update_time(file, ret); } EXPORT_SYMBOL(file_update_time); /** * file_modified_flags - handle mandated vfs changes when modifying a file * @file: file that was modified * @flags: kiocb flags * * When file has been modified ensure that special * file privileges are removed and time settings are updated. * * If IOCB_NOWAIT is set, special file privileges will not be removed and * time settings will not be updated. It will return -EAGAIN. * * Context: Caller must hold the file's inode lock. * * Return: 0 on success, negative errno on failure. */ static int file_modified_flags(struct file *file, int flags) { int ret; struct inode *inode = file_inode(file); /* * Clear the security bits if the process is not being run by root. * This keeps people from modifying setuid and setgid binaries. */ ret = file_remove_privs_flags(file, flags); if (ret) return ret; if (unlikely(file->f_mode & FMODE_NOCMTIME)) return 0; ret = inode_needs_update_time(inode); if (ret <= 0) return ret; if (flags & IOCB_NOWAIT) return -EAGAIN; return __file_update_time(file, ret); } /** * file_modified - handle mandated vfs changes when modifying a file * @file: file that was modified * * When file has been modified ensure that special * file privileges are removed and time settings are updated. * * Context: Caller must hold the file's inode lock. * * Return: 0 on success, negative errno on failure. */ int file_modified(struct file *file) { return file_modified_flags(file, 0); } EXPORT_SYMBOL(file_modified); /** * kiocb_modified - handle mandated vfs changes when modifying a file * @iocb: iocb that was modified * * When file has been modified ensure that special * file privileges are removed and time settings are updated. * * Context: Caller must hold the file's inode lock. * * Return: 0 on success, negative errno on failure. */ int kiocb_modified(struct kiocb *iocb) { return file_modified_flags(iocb->ki_filp, iocb->ki_flags); } EXPORT_SYMBOL_GPL(kiocb_modified); int inode_needs_sync(struct inode *inode) { if (IS_SYNC(inode)) return 1; if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) return 1; return 0; } EXPORT_SYMBOL(inode_needs_sync); /* * If we try to find an inode in the inode hash while it is being * deleted, we have to wait until the filesystem completes its * deletion before reporting that it isn't found. This function waits * until the deletion _might_ have completed. Callers are responsible * to recheck inode state. * * It doesn't matter if I_NEW is not set initially, a call to * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list * will DTRT. */ static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked) { struct wait_bit_queue_entry wqe; struct wait_queue_head *wq_head; /* * Handle racing against evict(), see that routine for more details. */ if (unlikely(inode_unhashed(inode))) { WARN_ON(is_inode_hash_locked); spin_unlock(&inode->i_lock); return; } wq_head = inode_bit_waitqueue(&wqe, inode, __I_NEW); prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); rcu_read_unlock(); if (is_inode_hash_locked) spin_unlock(&inode_hash_lock); schedule(); finish_wait(wq_head, &wqe.wq_entry); if (is_inode_hash_locked) spin_lock(&inode_hash_lock); rcu_read_lock(); } static __initdata unsigned long ihash_entries; static int __init set_ihash_entries(char *str) { if (!str) return 0; ihash_entries = simple_strtoul(str, &str, 0); return 1; } __setup("ihash_entries=", set_ihash_entries); /* * Initialize the waitqueues and inode hash table. */ void __init inode_init_early(void) { /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. */ if (hashdist) return; inode_hashtable = alloc_large_system_hash("Inode-cache", sizeof(struct hlist_head), ihash_entries, 14, HASH_EARLY | HASH_ZERO, &i_hash_shift, &i_hash_mask, 0, 0); } void __init inode_init(void) { /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), 0, (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| SLAB_ACCOUNT), init_once); /* Hash may have been set up in inode_init_early */ if (!hashdist) return; inode_hashtable = alloc_large_system_hash("Inode-cache", sizeof(struct hlist_head), ihash_entries, 14, HASH_ZERO, &i_hash_shift, &i_hash_mask, 0, 0); } void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) { inode->i_mode = mode; switch (inode->i_mode & S_IFMT) { case S_IFCHR: inode->i_fop = &def_chr_fops; inode->i_rdev = rdev; break; case S_IFBLK: if (IS_ENABLED(CONFIG_BLOCK)) inode->i_fop = &def_blk_fops; inode->i_rdev = rdev; break; case S_IFIFO: inode->i_fop = &pipefifo_fops; break; case S_IFSOCK: /* leave it no_open_fops */ break; default: printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for" " inode %s:%lu\n", mode, inode->i_sb->s_id, inode->i_ino); break; } } EXPORT_SYMBOL(init_special_inode); /** * inode_init_owner - Init uid,gid,mode for new inode according to posix standards * @idmap: idmap of the mount the inode was created from * @inode: New inode * @dir: Directory inode * @mode: mode of the new inode * * If the inode has been created through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions * and initializing i_uid and i_gid. On non-idmapped mounts or if permission * checking is to be performed on the raw inode simply pass @nop_mnt_idmap. */ void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode, const struct inode *dir, umode_t mode) { inode_fsuid_set(inode, idmap); if (dir && dir->i_mode & S_ISGID) { inode->i_gid = dir->i_gid; /* Directories are special, and always inherit S_ISGID */ if (S_ISDIR(mode)) mode |= S_ISGID; } else inode_fsgid_set(inode, idmap); inode->i_mode = mode; } EXPORT_SYMBOL(inode_init_owner); /** * inode_owner_or_capable - check current task permissions to inode * @idmap: idmap of the mount the inode was found from * @inode: inode being checked * * Return true if current either has CAP_FOWNER in a namespace with the * inode owner uid mapped, or owns the file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ bool inode_owner_or_capable(struct mnt_idmap *idmap, const struct inode *inode) { vfsuid_t vfsuid; struct user_namespace *ns; vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return true; ns = current_user_ns(); if (vfsuid_has_mapping(ns, vfsuid) && ns_capable(ns, CAP_FOWNER)) return true; return false; } EXPORT_SYMBOL(inode_owner_or_capable); /* * Direct i/o helper functions */ bool inode_dio_finished(const struct inode *inode) { return atomic_read(&inode->i_dio_count) == 0; } EXPORT_SYMBOL(inode_dio_finished); /** * inode_dio_wait - wait for outstanding DIO requests to finish * @inode: inode to wait for * * Waits for all pending direct I/O requests to finish so that we can * proceed with a truncate or equivalent operation. * * Must be called under a lock that serializes taking new references * to i_dio_count, usually by inode->i_rwsem. */ void inode_dio_wait(struct inode *inode) { wait_var_event(&inode->i_dio_count, inode_dio_finished(inode)); } EXPORT_SYMBOL(inode_dio_wait); void inode_dio_wait_interruptible(struct inode *inode) { wait_var_event_interruptible(&inode->i_dio_count, inode_dio_finished(inode)); } EXPORT_SYMBOL(inode_dio_wait_interruptible); /* * inode_set_flags - atomically set some inode flags * * Note: the caller should be holding i_rwsem exclusively, or else be sure that * they have exclusive access to the inode structure (i.e., while the * inode is being instantiated). The reason for the cmpxchg() loop * --- which wouldn't be necessary if all code paths which modify * i_flags actually followed this rule, is that there is at least one * code path which doesn't today so we use cmpxchg() out of an abundance * of caution. * * In the long run, i_rwsem is overkill, and we should probably look * at using the i_lock spinlock to protect i_flags, and then make sure * it is so documented in include/linux/fs.h and that all code follows * the locking convention!! */ void inode_set_flags(struct inode *inode, unsigned int flags, unsigned int mask) { WARN_ON_ONCE(flags & ~mask); set_mask_bits(&inode->i_flags, mask, flags); } EXPORT_SYMBOL(inode_set_flags); void inode_nohighmem(struct inode *inode) { mapping_set_gfp_mask(inode->i_mapping, GFP_USER); } EXPORT_SYMBOL(inode_nohighmem); struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts) { trace_inode_set_ctime_to_ts(inode, &ts); set_normalized_timespec64(&ts, ts.tv_sec, ts.tv_nsec); inode->i_ctime_sec = ts.tv_sec; inode->i_ctime_nsec = ts.tv_nsec; return ts; } EXPORT_SYMBOL(inode_set_ctime_to_ts); /** * timestamp_truncate - Truncate timespec to a granularity * @t: Timespec * @inode: inode being updated * * Truncate a timespec to the granularity supported by the fs * containing the inode. Always rounds down. gran must * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns). */ struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode) { struct super_block *sb = inode->i_sb; unsigned int gran = sb->s_time_gran; t.tv_sec = clamp(t.tv_sec, sb->s_time_min, sb->s_time_max); if (unlikely(t.tv_sec == sb->s_time_max || t.tv_sec == sb->s_time_min)) t.tv_nsec = 0; /* Avoid division in the common cases 1 ns and 1 s. */ if (gran == 1) ; /* nothing */ else if (gran == NSEC_PER_SEC) t.tv_nsec = 0; else if (gran > 1 && gran < NSEC_PER_SEC) t.tv_nsec -= t.tv_nsec % gran; else WARN(1, "invalid file time granularity: %u", gran); return t; } EXPORT_SYMBOL(timestamp_truncate); /** * inode_set_ctime_current - set the ctime to current_time * @inode: inode * * Set the inode's ctime to the current value for the inode. Returns the * current value that was assigned. If this is not a multigrain inode, then we * set it to the later of the coarse time and floor value. * * If it is multigrain, then we first see if the coarse-grained timestamp is * distinct from what is already there. If so, then use that. Otherwise, get a * fine-grained timestamp. * * After that, try to swap the new value into i_ctime_nsec. Accept the * resulting ctime, regardless of the outcome of the swap. If it has * already been replaced, then that timestamp is later than the earlier * unacceptable one, and is thus acceptable. */ struct timespec64 inode_set_ctime_current(struct inode *inode) { struct timespec64 now; u32 cns, cur; ktime_get_coarse_real_ts64_mg(&now); now = timestamp_truncate(now, inode); /* Just return that if this is not a multigrain fs */ if (!is_mgtime(inode)) { inode_set_ctime_to_ts(inode, now); goto out; } /* * A fine-grained time is only needed if someone has queried * for timestamps, and the current coarse grained time isn't * later than what's already there. */ cns = smp_load_acquire(&inode->i_ctime_nsec); if (cns & I_CTIME_QUERIED) { struct timespec64 ctime = { .tv_sec = inode->i_ctime_sec, .tv_nsec = cns & ~I_CTIME_QUERIED }; if (timespec64_compare(&now, &ctime) <= 0) { ktime_get_real_ts64_mg(&now); now = timestamp_truncate(now, inode); mgtime_counter_inc(mg_fine_stamps); } } mgtime_counter_inc(mg_ctime_updates); /* No need to cmpxchg if it's exactly the same */ if (cns == now.tv_nsec && inode->i_ctime_sec == now.tv_sec) { trace_ctime_xchg_skip(inode, &now); goto out; } cur = cns; retry: /* Try to swap the nsec value into place. */ if (try_cmpxchg(&inode->i_ctime_nsec, &cur, now.tv_nsec)) { /* If swap occurred, then we're (mostly) done */ inode->i_ctime_sec = now.tv_sec; trace_ctime_ns_xchg(inode, cns, now.tv_nsec, cur); mgtime_counter_inc(mg_ctime_swaps); } else { /* * Was the change due to someone marking the old ctime QUERIED? * If so then retry the swap. This can only happen once since * the only way to clear I_CTIME_QUERIED is to stamp the inode * with a new ctime. */ if (!(cns & I_CTIME_QUERIED) && (cns | I_CTIME_QUERIED) == cur) { cns = cur; goto retry; } /* Otherwise, keep the existing ctime */ now.tv_sec = inode->i_ctime_sec; now.tv_nsec = cur & ~I_CTIME_QUERIED; } out: return now; } EXPORT_SYMBOL(inode_set_ctime_current); /** * inode_set_ctime_deleg - try to update the ctime on a delegated inode * @inode: inode to update * @update: timespec64 to set the ctime * * Attempt to atomically update the ctime on behalf of a delegation holder. * * The nfs server can call back the holder of a delegation to get updated * inode attributes, including the mtime. When updating the mtime, update * the ctime to a value at least equal to that. * * This can race with concurrent updates to the inode, in which * case the update is skipped. * * Note that this works even when multigrain timestamps are not enabled, * so it is used in either case. */ struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 update) { struct timespec64 now, cur_ts; u32 cur, old; /* pairs with try_cmpxchg below */ cur = smp_load_acquire(&inode->i_ctime_nsec); cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED; cur_ts.tv_sec = inode->i_ctime_sec; /* If the update is older than the existing value, skip it. */ if (timespec64_compare(&update, &cur_ts) <= 0) return cur_ts; ktime_get_coarse_real_ts64_mg(&now); /* Clamp the update to "now" if it's in the future */ if (timespec64_compare(&update, &now) > 0) update = now; update = timestamp_truncate(update, inode); /* No need to update if the values are already the same */ if (timespec64_equal(&update, &cur_ts)) return cur_ts; /* * Try to swap the nsec value into place. If it fails, that means * it raced with an update due to a write or similar activity. That * stamp takes precedence, so just skip the update. */ retry: old = cur; if (try_cmpxchg(&inode->i_ctime_nsec, &cur, update.tv_nsec)) { inode->i_ctime_sec = update.tv_sec; mgtime_counter_inc(mg_ctime_swaps); return update; } /* * Was the change due to another task marking the old ctime QUERIED? * * If so, then retry the swap. This can only happen once since * the only way to clear I_CTIME_QUERIED is to stamp the inode * with a new ctime. */ if (!(old & I_CTIME_QUERIED) && (cur == (old | I_CTIME_QUERIED))) goto retry; /* Otherwise, it was a new timestamp. */ cur_ts.tv_sec = inode->i_ctime_sec; cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED; return cur_ts; } EXPORT_SYMBOL(inode_set_ctime_deleg); /** * in_group_or_capable - check whether caller is CAP_FSETID privileged * @idmap: idmap of the mount @inode was found from * @inode: inode to check * @vfsgid: the new/current vfsgid of @inode * * Check whether @vfsgid is in the caller's group list or if the caller is * privileged with CAP_FSETID over @inode. This can be used to determine * whether the setgid bit can be kept or must be dropped. * * Return: true if the caller is sufficiently privileged, false if not. */ bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid) { if (vfsgid_in_group_p(vfsgid)) return true; if (capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID)) return true; return false; } EXPORT_SYMBOL(in_group_or_capable); /** * mode_strip_sgid - handle the sgid bit for non-directories * @idmap: idmap of the mount the inode was created from * @dir: parent directory inode * @mode: mode of the file to be created in @dir * * If the @mode of the new file has both the S_ISGID and S_IXGRP bit * raised and @dir has the S_ISGID bit raised ensure that the caller is * either in the group of the parent directory or they have CAP_FSETID * in their user namespace and are privileged over the parent directory. * In all other cases, strip the S_ISGID bit from @mode. * * Return: the new mode to use for the file */ umode_t mode_strip_sgid(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode) { if ((mode & (S_ISGID | S_IXGRP)) != (S_ISGID | S_IXGRP)) return mode; if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID)) return mode; if (in_group_or_capable(idmap, dir, i_gid_into_vfsgid(idmap, dir))) return mode; return mode & ~S_ISGID; } EXPORT_SYMBOL(mode_strip_sgid); #ifdef CONFIG_DEBUG_VFS /* * Dump an inode. * * TODO: add a proper inode dumping routine, this is a stub to get debug off the * ground. * * TODO: handle getting to fs type with get_kernel_nofault()? * See dump_mapping() above. */ void dump_inode(struct inode *inode, const char *reason) { struct super_block *sb = inode->i_sb; pr_warn("%s encountered for inode %px\n" "fs %s mode %ho opflags 0x%hx flags 0x%x state 0x%x count %d\n", reason, inode, sb->s_type->name, inode->i_mode, inode->i_opflags, inode->i_flags, inode->i_state, atomic_read(&inode->i_count)); } EXPORT_SYMBOL(dump_inode); #endif
5 2 5 5 5 5 5 2 2 2 2 5 2 5 5 5 5 5 5 5 5 5 2 2 5 2 2 1 2 2 2 1 2 1 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright Red Hat Inc. 2017 * * This file is part of the SCTP kernel implementation * * These functions manipulate sctp stream queue/scheduling. * * Please send any bug reports or fixes you make to the * email addresched(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> */ #include <linux/list.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> /* Priority handling * RFC DRAFT ndata section 3.4 */ static void sctp_sched_prio_unsched_all(struct sctp_stream *stream); static struct sctp_stream_priorities *sctp_sched_prio_head_get(struct sctp_stream_priorities *p) { p->users++; return p; } static void sctp_sched_prio_head_put(struct sctp_stream_priorities *p) { if (p && --p->users == 0) kfree(p); } static struct sctp_stream_priorities *sctp_sched_prio_new_head( struct sctp_stream *stream, int prio, gfp_t gfp) { struct sctp_stream_priorities *p; p = kmalloc(sizeof(*p), gfp); if (!p) return NULL; INIT_LIST_HEAD(&p->prio_sched); INIT_LIST_HEAD(&p->active); p->next = NULL; p->prio = prio; p->users = 1; return p; } static struct sctp_stream_priorities *sctp_sched_prio_get_head( struct sctp_stream *stream, int prio, gfp_t gfp) { struct sctp_stream_priorities *p; int i; /* Look into scheduled priorities first, as they are sorted and * we can find it fast IF it's scheduled. */ list_for_each_entry(p, &stream->prio_list, prio_sched) { if (p->prio == prio) return sctp_sched_prio_head_get(p); if (p->prio > prio) break; } /* No luck. So we search on all streams now. */ for (i = 0; i < stream->outcnt; i++) { if (!SCTP_SO(stream, i)->ext) continue; p = SCTP_SO(stream, i)->ext->prio_head; if (!p) /* Means all other streams won't be initialized * as well. */ break; if (p->prio == prio) return sctp_sched_prio_head_get(p); } /* If not even there, allocate a new one. */ return sctp_sched_prio_new_head(stream, prio, gfp); } static void sctp_sched_prio_next_stream(struct sctp_stream_priorities *p) { struct list_head *pos; pos = p->next->prio_list.next; if (pos == &p->active) pos = pos->next; p->next = list_entry(pos, struct sctp_stream_out_ext, prio_list); } static bool sctp_sched_prio_unsched(struct sctp_stream_out_ext *soute) { bool scheduled = false; if (!list_empty(&soute->prio_list)) { struct sctp_stream_priorities *prio_head = soute->prio_head; /* Scheduled */ scheduled = true; if (prio_head->next == soute) /* Try to move to the next stream */ sctp_sched_prio_next_stream(prio_head); list_del_init(&soute->prio_list); /* Also unsched the priority if this was the last stream */ if (list_empty(&prio_head->active)) { list_del_init(&prio_head->prio_sched); /* If there is no stream left, clear next */ prio_head->next = NULL; } } return scheduled; } static void sctp_sched_prio_sched(struct sctp_stream *stream, struct sctp_stream_out_ext *soute) { struct sctp_stream_priorities *prio, *prio_head; prio_head = soute->prio_head; /* Nothing to do if already scheduled */ if (!list_empty(&soute->prio_list)) return; /* Schedule the stream. If there is a next, we schedule the new * one before it, so it's the last in round robin order. * If there isn't, we also have to schedule the priority. */ if (prio_head->next) { list_add(&soute->prio_list, prio_head->next->prio_list.prev); return; } list_add(&soute->prio_list, &prio_head->active); prio_head->next = soute; list_for_each_entry(prio, &stream->prio_list, prio_sched) { if (prio->prio > prio_head->prio) { list_add(&prio_head->prio_sched, prio->prio_sched.prev); return; } } list_add_tail(&prio_head->prio_sched, &stream->prio_list); } static int sctp_sched_prio_set(struct sctp_stream *stream, __u16 sid, __u16 prio, gfp_t gfp) { struct sctp_stream_out *sout = SCTP_SO(stream, sid); struct sctp_stream_out_ext *soute = sout->ext; struct sctp_stream_priorities *prio_head, *old; bool reschedule = false; old = soute->prio_head; if (old && old->prio == prio) return 0; prio_head = sctp_sched_prio_get_head(stream, prio, gfp); if (!prio_head) return -ENOMEM; reschedule = sctp_sched_prio_unsched(soute); soute->prio_head = prio_head; if (reschedule) sctp_sched_prio_sched(stream, soute); sctp_sched_prio_head_put(old); return 0; } static int sctp_sched_prio_get(struct sctp_stream *stream, __u16 sid, __u16 *value) { *value = SCTP_SO(stream, sid)->ext->prio_head->prio; return 0; } static int sctp_sched_prio_init(struct sctp_stream *stream) { INIT_LIST_HEAD(&stream->prio_list); return 0; } static int sctp_sched_prio_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) { INIT_LIST_HEAD(&SCTP_SO(stream, sid)->ext->prio_list); return sctp_sched_prio_set(stream, sid, 0, gfp); } static void sctp_sched_prio_free_sid(struct sctp_stream *stream, __u16 sid) { sctp_sched_prio_head_put(SCTP_SO(stream, sid)->ext->prio_head); SCTP_SO(stream, sid)->ext->prio_head = NULL; } static void sctp_sched_prio_enqueue(struct sctp_outq *q, struct sctp_datamsg *msg) { struct sctp_stream *stream; struct sctp_chunk *ch; __u16 sid; ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list); sid = sctp_chunk_stream_no(ch); stream = &q->asoc->stream; sctp_sched_prio_sched(stream, SCTP_SO(stream, sid)->ext); } static struct sctp_chunk *sctp_sched_prio_dequeue(struct sctp_outq *q) { struct sctp_stream *stream = &q->asoc->stream; struct sctp_stream_priorities *prio; struct sctp_stream_out_ext *soute; struct sctp_chunk *ch = NULL; /* Bail out quickly if queue is empty */ if (list_empty(&q->out_chunk_list)) goto out; /* Find which chunk is next. It's easy, it's either the current * one or the first chunk on the next active stream. */ if (stream->out_curr) { soute = stream->out_curr->ext; } else { prio = list_entry(stream->prio_list.next, struct sctp_stream_priorities, prio_sched); soute = prio->next; } ch = list_entry(soute->outq.next, struct sctp_chunk, stream_list); sctp_sched_dequeue_common(q, ch); out: return ch; } static void sctp_sched_prio_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch) { struct sctp_stream_priorities *prio; struct sctp_stream_out_ext *soute; __u16 sid; /* Last chunk on that msg, move to the next stream on * this priority. */ sid = sctp_chunk_stream_no(ch); soute = SCTP_SO(&q->asoc->stream, sid)->ext; prio = soute->prio_head; sctp_sched_prio_next_stream(prio); if (list_empty(&soute->outq)) sctp_sched_prio_unsched(soute); } static void sctp_sched_prio_sched_all(struct sctp_stream *stream) { struct sctp_association *asoc; struct sctp_stream_out *sout; struct sctp_chunk *ch; asoc = container_of(stream, struct sctp_association, stream); list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { __u16 sid; sid = sctp_chunk_stream_no(ch); sout = SCTP_SO(stream, sid); if (sout->ext) sctp_sched_prio_sched(stream, sout->ext); } } static void sctp_sched_prio_unsched_all(struct sctp_stream *stream) { struct sctp_stream_priorities *p, *tmp; struct sctp_stream_out_ext *soute, *souttmp; list_for_each_entry_safe(p, tmp, &stream->prio_list, prio_sched) list_for_each_entry_safe(soute, souttmp, &p->active, prio_list) sctp_sched_prio_unsched(soute); } static struct sctp_sched_ops sctp_sched_prio = { .set = sctp_sched_prio_set, .get = sctp_sched_prio_get, .init = sctp_sched_prio_init, .init_sid = sctp_sched_prio_init_sid, .free_sid = sctp_sched_prio_free_sid, .enqueue = sctp_sched_prio_enqueue, .dequeue = sctp_sched_prio_dequeue, .dequeue_done = sctp_sched_prio_dequeue_done, .sched_all = sctp_sched_prio_sched_all, .unsched_all = sctp_sched_prio_unsched_all, }; void sctp_sched_ops_prio_init(void) { sctp_sched_ops_register(SCTP_SS_PRIO, &sctp_sched_prio); }
7 7 7 7 7 7 1 6 7 7 7 7 22 22 22 22 22 7 7 7 22 22 22 22 22 22 22 22 22 22 7 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 // SPDX-License-Identifier: GPL-2.0-only /* * Scanning implementation * * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi> * Copyright 2004, Instant802 Networks, Inc. * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright 2016-2017 Intel Deutschland GmbH * Copyright (C) 2018-2025 Intel Corporation */ #include <linux/if_arp.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <net/sch_generic.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/random.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "mesh.h" #define IEEE80211_PROBE_DELAY (HZ / 33) #define IEEE80211_CHANNEL_TIME (HZ / 33) #define IEEE80211_PASSIVE_CHANNEL_TIME (HZ / 9) void ieee80211_rx_bss_put(struct ieee80211_local *local, struct ieee80211_bss *bss) { if (!bss) return; cfg80211_put_bss(local->hw.wiphy, container_of((void *)bss, struct cfg80211_bss, priv)); } static bool is_uapsd_supported(struct ieee802_11_elems *elems) { u8 qos_info; if (elems->wmm_info && elems->wmm_info_len == 7 && elems->wmm_info[5] == 1) qos_info = elems->wmm_info[6]; else if (elems->wmm_param && elems->wmm_param_len == 24 && elems->wmm_param[5] == 1) qos_info = elems->wmm_param[6]; else /* no valid wmm information or parameter element found */ return false; return qos_info & IEEE80211_WMM_IE_AP_QOSINFO_UAPSD; } struct inform_bss_update_data { struct ieee80211_rx_status *rx_status; bool beacon; }; void ieee80211_inform_bss(struct wiphy *wiphy, struct cfg80211_bss *cbss, const struct cfg80211_bss_ies *ies, void *data) { struct ieee80211_local *local = wiphy_priv(wiphy); struct inform_bss_update_data *update_data = data; struct ieee80211_bss *bss = (void *)cbss->priv; struct ieee80211_rx_status *rx_status; struct ieee802_11_elems *elems; int clen, srlen; /* This happens while joining an IBSS */ if (!update_data) return; elems = ieee802_11_parse_elems(ies->data, ies->len, false, NULL); if (!elems) return; rx_status = update_data->rx_status; if (update_data->beacon) bss->device_ts_beacon = rx_status->device_timestamp; else bss->device_ts_presp = rx_status->device_timestamp; if (elems->parse_error) { if (update_data->beacon) bss->corrupt_data |= IEEE80211_BSS_CORRUPT_BEACON; else bss->corrupt_data |= IEEE80211_BSS_CORRUPT_PROBE_RESP; } else { if (update_data->beacon) bss->corrupt_data &= ~IEEE80211_BSS_CORRUPT_BEACON; else bss->corrupt_data &= ~IEEE80211_BSS_CORRUPT_PROBE_RESP; } /* save the ERP value so that it is available at association time */ if (elems->erp_info && (!elems->parse_error || !(bss->valid_data & IEEE80211_BSS_VALID_ERP))) { bss->erp_value = elems->erp_info[0]; bss->has_erp_value = true; if (!elems->parse_error) bss->valid_data |= IEEE80211_BSS_VALID_ERP; } /* replace old supported rates if we get new values */ if (!elems->parse_error || !(bss->valid_data & IEEE80211_BSS_VALID_RATES)) { srlen = 0; if (elems->supp_rates) { clen = IEEE80211_MAX_SUPP_RATES; if (clen > elems->supp_rates_len) clen = elems->supp_rates_len; memcpy(bss->supp_rates, elems->supp_rates, clen); srlen += clen; } if (elems->ext_supp_rates) { clen = IEEE80211_MAX_SUPP_RATES - srlen; if (clen > elems->ext_supp_rates_len) clen = elems->ext_supp_rates_len; memcpy(bss->supp_rates + srlen, elems->ext_supp_rates, clen); srlen += clen; } if (srlen) { bss->supp_rates_len = srlen; if (!elems->parse_error) bss->valid_data |= IEEE80211_BSS_VALID_RATES; } } if (!elems->parse_error || !(bss->valid_data & IEEE80211_BSS_VALID_WMM)) { bss->wmm_used = elems->wmm_param || elems->wmm_info; bss->uapsd_supported = is_uapsd_supported(elems); if (!elems->parse_error) bss->valid_data |= IEEE80211_BSS_VALID_WMM; } if (update_data->beacon) { struct ieee80211_supported_band *sband = local->hw.wiphy->bands[rx_status->band]; if (!(rx_status->encoding == RX_ENC_HT) && !(rx_status->encoding == RX_ENC_VHT)) bss->beacon_rate = &sband->bitrates[rx_status->rate_idx]; } if (elems->vht_cap_elem) bss->vht_cap_info = le32_to_cpu(elems->vht_cap_elem->vht_cap_info); else bss->vht_cap_info = 0; kfree(elems); } struct ieee80211_bss * ieee80211_bss_info_update(struct ieee80211_local *local, struct ieee80211_rx_status *rx_status, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_channel *channel) { bool beacon = ieee80211_is_beacon(mgmt->frame_control) || ieee80211_is_s1g_beacon(mgmt->frame_control); struct cfg80211_bss *cbss; struct inform_bss_update_data update_data = { .rx_status = rx_status, .beacon = beacon, }; struct cfg80211_inform_bss bss_meta = { .boottime_ns = rx_status->boottime_ns, .drv_data = (void *)&update_data, }; bool signal_valid; struct ieee80211_sub_if_data *scan_sdata; if (rx_status->flag & RX_FLAG_NO_SIGNAL_VAL) bss_meta.signal = 0; /* invalid signal indication */ else if (ieee80211_hw_check(&local->hw, SIGNAL_DBM)) bss_meta.signal = rx_status->signal * 100; else if (ieee80211_hw_check(&local->hw, SIGNAL_UNSPEC)) bss_meta.signal = (rx_status->signal * 100) / local->hw.max_signal; bss_meta.chan = channel; rcu_read_lock(); scan_sdata = rcu_dereference(local->scan_sdata); if (scan_sdata && scan_sdata->vif.type == NL80211_IFTYPE_STATION && scan_sdata->vif.cfg.assoc && ieee80211_have_rx_timestamp(rx_status)) { struct ieee80211_bss_conf *link_conf = NULL; /* for an MLO connection, set the TSF data only in case we have * an indication on which of the links the frame was received */ if (ieee80211_vif_is_mld(&scan_sdata->vif)) { if (rx_status->link_valid) { s8 link_id = rx_status->link_id; link_conf = rcu_dereference(scan_sdata->vif.link_conf[link_id]); } } else { link_conf = &scan_sdata->vif.bss_conf; } if (link_conf) { bss_meta.parent_tsf = ieee80211_calculate_rx_timestamp(local, rx_status, len + FCS_LEN, 24); ether_addr_copy(bss_meta.parent_bssid, link_conf->bssid); } } rcu_read_unlock(); cbss = cfg80211_inform_bss_frame_data(local->hw.wiphy, &bss_meta, mgmt, len, GFP_ATOMIC); if (!cbss) return NULL; /* In case the signal is invalid update the status */ signal_valid = channel == cbss->channel; if (!signal_valid) rx_status->flag |= RX_FLAG_NO_SIGNAL_VAL; return (void *)cbss->priv; } static bool ieee80211_scan_accept_presp(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, u32 scan_flags, const u8 *da) { struct ieee80211_link_data *link_sdata; u8 link_id; if (!sdata) return false; /* accept broadcast on 6 GHz and for OCE */ if (is_broadcast_ether_addr(da) && (channel->band == NL80211_BAND_6GHZ || scan_flags & NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP)) return true; if (scan_flags & NL80211_SCAN_FLAG_RANDOM_ADDR) return true; if (ether_addr_equal(da, sdata->vif.addr)) return true; for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { link_sdata = rcu_dereference(sdata->link[link_id]); if (!link_sdata) continue; if (ether_addr_equal(da, link_sdata->conf->addr)) return true; } return false; } void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) { struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb); struct ieee80211_mgmt *mgmt = (void *)skb->data; struct ieee80211_bss *bss; struct ieee80211_channel *channel; struct ieee80211_ext *ext; size_t min_hdr_len = offsetof(struct ieee80211_mgmt, u.probe_resp.variable); if (!ieee80211_is_probe_resp(mgmt->frame_control) && !ieee80211_is_beacon(mgmt->frame_control) && !ieee80211_is_s1g_beacon(mgmt->frame_control)) return; if (ieee80211_is_s1g_beacon(mgmt->frame_control)) { ext = (struct ieee80211_ext *)mgmt; min_hdr_len = offsetof(struct ieee80211_ext, u.s1g_beacon.variable) + ieee80211_s1g_optional_len(ext->frame_control); } if (skb->len < min_hdr_len) return; if (test_and_clear_bit(SCAN_BEACON_WAIT, &local->scanning)) { /* * we were passive scanning because of radar/no-IR, but * the beacon/proberesp rx gives us an opportunity to upgrade * to active scan */ set_bit(SCAN_BEACON_DONE, &local->scanning); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, 0); } channel = ieee80211_get_channel_khz(local->hw.wiphy, ieee80211_rx_status_to_khz(rx_status)); if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) return; if (ieee80211_is_probe_resp(mgmt->frame_control)) { struct ieee80211_sub_if_data *sdata1, *sdata2; struct cfg80211_scan_request *scan_req; struct cfg80211_sched_scan_request *sched_scan_req; u32 scan_req_flags = 0, sched_scan_req_flags = 0; sdata1 = rcu_dereference(local->scan_sdata); sdata2 = rcu_dereference(local->sched_scan_sdata); if (likely(!sdata1 && !sdata2)) return; scan_req = rcu_dereference(local->scan_req); sched_scan_req = rcu_dereference(local->sched_scan_req); if (scan_req) scan_req_flags = scan_req->flags; if (sched_scan_req) sched_scan_req_flags = sched_scan_req->flags; /* ignore ProbeResp to foreign address or non-bcast (OCE) * unless scanning with randomised address */ if (!ieee80211_scan_accept_presp(sdata1, channel, scan_req_flags, mgmt->da) && !ieee80211_scan_accept_presp(sdata2, channel, sched_scan_req_flags, mgmt->da)) return; } else { /* Beacons are expected only with broadcast address */ if (!is_broadcast_ether_addr(mgmt->da)) return; } /* Do not update the BSS table in case of only monitor interfaces */ if (local->open_count == local->monitors) return; bss = ieee80211_bss_info_update(local, rx_status, mgmt, skb->len, channel); if (bss) ieee80211_rx_bss_put(local, bss); } static void ieee80211_prepare_scan_chandef(struct cfg80211_chan_def *chandef) { memset(chandef, 0, sizeof(*chandef)); chandef->width = NL80211_CHAN_WIDTH_20_NOHT; } /* return false if no more work */ static bool ieee80211_prep_hw_scan(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct cfg80211_scan_request *req; struct cfg80211_chan_def chandef; u8 bands_used = 0; int i, ielen; u32 *n_chans; u32 flags = 0; req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); if (test_bit(SCAN_HW_CANCELLED, &local->scanning)) return false; if (ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS)) { local->hw_scan_req->req.n_channels = req->n_channels; for (i = 0; i < req->n_channels; i++) { local->hw_scan_req->req.channels[i] = req->channels[i]; bands_used |= BIT(req->channels[i]->band); } } else { do { if (local->hw_scan_band == NUM_NL80211_BANDS) return false; n_chans = &local->hw_scan_req->req.n_channels; *n_chans = 0; for (i = 0; i < req->n_channels; i++) { if (req->channels[i]->band != local->hw_scan_band) continue; local->hw_scan_req->req.channels[(*n_chans)++] = req->channels[i]; bands_used |= BIT(req->channels[i]->band); } local->hw_scan_band++; } while (!*n_chans); } ieee80211_prepare_scan_chandef(&chandef); if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; ielen = ieee80211_build_preq_ies(sdata, (u8 *)local->hw_scan_req->req.ie, local->hw_scan_ies_bufsize, &local->hw_scan_req->ies, req->ie, req->ie_len, bands_used, req->rates, &chandef, flags); if (ielen < 0) return false; local->hw_scan_req->req.ie_len = ielen; local->hw_scan_req->req.no_cck = req->no_cck; ether_addr_copy(local->hw_scan_req->req.mac_addr, req->mac_addr); ether_addr_copy(local->hw_scan_req->req.mac_addr_mask, req->mac_addr_mask); ether_addr_copy(local->hw_scan_req->req.bssid, req->bssid); return true; } static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) { struct ieee80211_local *local = hw_to_local(hw); bool hw_scan = test_bit(SCAN_HW_SCANNING, &local->scanning); bool was_scanning = local->scanning; struct cfg80211_scan_request *scan_req; struct ieee80211_sub_if_data *scan_sdata; struct ieee80211_sub_if_data *sdata; lockdep_assert_wiphy(local->hw.wiphy); /* * It's ok to abort a not-yet-running scan (that * we have one at all will be verified by checking * local->scan_req next), but not to complete it * successfully. */ if (WARN_ON(!local->scanning && !aborted)) aborted = true; if (WARN_ON(!local->scan_req)) return; scan_sdata = rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); if (hw_scan && !aborted && !ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS) && ieee80211_prep_hw_scan(scan_sdata)) { int rc; rc = drv_hw_scan(local, rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)), local->hw_scan_req); if (rc == 0) return; /* HW scan failed and is going to be reported as aborted, * so clear old scan info. */ memset(&local->scan_info, 0, sizeof(local->scan_info)); aborted = true; } kfree(local->hw_scan_req); local->hw_scan_req = NULL; scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); RCU_INIT_POINTER(local->scan_req, NULL); RCU_INIT_POINTER(local->scan_sdata, NULL); local->scanning = 0; local->scan_chandef.chan = NULL; synchronize_rcu(); if (scan_req != local->int_scan_req) { local->scan_info.aborted = aborted; cfg80211_scan_done(scan_req, &local->scan_info); } /* Set power back to normal operating levels. */ ieee80211_hw_conf_chan(local); if (!hw_scan && was_scanning) { ieee80211_configure_filter(local); drv_sw_scan_complete(local, scan_sdata); ieee80211_offchannel_return(local); } ieee80211_recalc_idle(local); ieee80211_mlme_notify_scan_completed(local); ieee80211_ibss_notify_scan_completed(local); /* Requeue all the work that might have been ignored while * the scan was in progress; if there was none this will * just be a no-op for the particular interface. */ list_for_each_entry(sdata, &local->interfaces, list) { if (ieee80211_sdata_running(sdata)) wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } if (was_scanning) ieee80211_start_next_roc(local); } void ieee80211_scan_completed(struct ieee80211_hw *hw, struct cfg80211_scan_info *info) { struct ieee80211_local *local = hw_to_local(hw); trace_api_scan_completed(local, info->aborted); set_bit(SCAN_COMPLETED, &local->scanning); if (info->aborted) set_bit(SCAN_ABORTED, &local->scanning); memcpy(&local->scan_info, info, sizeof(*info)); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, 0); } EXPORT_SYMBOL(ieee80211_scan_completed); static int ieee80211_start_sw_scan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { /* Software scan is not supported in multi-channel cases */ if (!local->emulate_chanctx) return -EOPNOTSUPP; /* * Hardware/driver doesn't support hw_scan, so use software * scanning instead. First send a nullfunc frame with power save * bit on so that AP will buffer the frames for us while we are not * listening, then send probe requests to each channel and wait for * the responses. After all channels are scanned, tune back to the * original channel and send a nullfunc frame with power save bit * off to trigger the AP to send us all the buffered frames. * * Note that while local->sw_scanning is true everything else but * nullfunc frames and probe requests will be dropped in * ieee80211_tx_h_check_assoc(). */ drv_sw_scan_start(local, sdata, local->scan_addr); local->leave_oper_channel_time = jiffies; local->next_scan_state = SCAN_DECISION; local->scan_channel_idx = 0; ieee80211_offchannel_stop_vifs(local); /* ensure nullfunc is transmitted before leaving operating channel */ ieee80211_flush_queues(local, NULL, false); ieee80211_configure_filter(local); /* We need to set power level at maximum rate for scanning. */ ieee80211_hw_conf_chan(local); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, 0); return 0; } static bool __ieee80211_can_leave_ch(struct ieee80211_sub_if_data *sdata, struct cfg80211_scan_request *req) { struct ieee80211_local *local = sdata->local; struct ieee80211_sub_if_data *sdata_iter; unsigned int link_id; lockdep_assert_wiphy(local->hw.wiphy); if (!ieee80211_is_radar_required(local, req)) return true; if (!regulatory_pre_cac_allowed(local->hw.wiphy)) return false; list_for_each_entry(sdata_iter, &local->interfaces, list) { for_each_valid_link(&sdata_iter->wdev, link_id) if (sdata_iter->wdev.links[link_id].cac_started) return false; } return true; } static bool ieee80211_can_scan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_scan_request *req) { if (!__ieee80211_can_leave_ch(sdata, req)) return false; if (!list_empty(&local->roc_list)) return false; if (sdata->vif.type == NL80211_IFTYPE_STATION && sdata->u.mgd.flags & IEEE80211_STA_CONNECTION_POLL) return false; return true; } void ieee80211_run_deferred_scan(struct ieee80211_local *local) { struct cfg80211_scan_request *req; lockdep_assert_wiphy(local->hw.wiphy); if (!local->scan_req || local->scanning) return; req = wiphy_dereference(local->hw.wiphy, local->scan_req); if (!ieee80211_can_scan(local, rcu_dereference_protected( local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)), req)) return; wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, round_jiffies_relative(0)); } static void ieee80211_send_scan_probe_req(struct ieee80211_sub_if_data *sdata, const u8 *src, const u8 *dst, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len, u32 ratemask, u32 flags, u32 tx_flags, struct ieee80211_channel *channel) { struct sk_buff *skb; skb = ieee80211_build_probe_req(sdata, src, dst, ratemask, channel, ssid, ssid_len, ie, ie_len, flags); if (skb) { if (flags & IEEE80211_PROBE_FLAG_RANDOM_SN) { struct ieee80211_hdr *hdr = (void *)skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u16 sn = get_random_u16(); info->control.flags |= IEEE80211_TX_CTRL_NO_SEQNO; hdr->seq_ctrl = cpu_to_le16(IEEE80211_SN_TO_SEQ(sn)); } IEEE80211_SKB_CB(skb)->flags |= tx_flags; IEEE80211_SKB_CB(skb)->control.flags |= IEEE80211_TX_CTRL_DONT_USE_RATE_MASK; ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band); } } static void ieee80211_scan_state_send_probe(struct ieee80211_local *local, unsigned long *next_delay) { int i; struct ieee80211_sub_if_data *sdata; struct cfg80211_scan_request *scan_req; enum nl80211_band band = local->hw.conf.chandef.chan->band; u32 flags = 0, tx_flags; scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); tx_flags = IEEE80211_TX_INTFL_OFFCHAN_TX_OK; if (scan_req->no_cck) tx_flags |= IEEE80211_TX_CTL_NO_CCK_RATE; if (scan_req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; if (scan_req->flags & NL80211_SCAN_FLAG_RANDOM_SN) flags |= IEEE80211_PROBE_FLAG_RANDOM_SN; sdata = rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); for (i = 0; i < scan_req->n_ssids; i++) ieee80211_send_scan_probe_req( sdata, local->scan_addr, scan_req->bssid, scan_req->ssids[i].ssid, scan_req->ssids[i].ssid_len, scan_req->ie, scan_req->ie_len, scan_req->rates[band], flags, tx_flags, local->hw.conf.chandef.chan); /* * After sending probe requests, wait for probe responses * on the channel. */ *next_delay = msecs_to_jiffies(scan_req->duration) > IEEE80211_PROBE_DELAY + IEEE80211_CHANNEL_TIME ? msecs_to_jiffies(scan_req->duration) - IEEE80211_PROBE_DELAY : IEEE80211_CHANNEL_TIME; local->next_scan_state = SCAN_DECISION; } static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, struct cfg80211_scan_request *req) { struct ieee80211_local *local = sdata->local; bool hw_scan = local->ops->hw_scan; int rc; lockdep_assert_wiphy(local->hw.wiphy); if (local->scan_req) return -EBUSY; /* For an MLO connection, if a link ID was specified, validate that it * is indeed active. */ if (ieee80211_vif_is_mld(&sdata->vif) && req->tsf_report_link_id >= 0 && !(sdata->vif.active_links & BIT(req->tsf_report_link_id))) return -EINVAL; if (!__ieee80211_can_leave_ch(sdata, req)) return -EBUSY; if (!ieee80211_can_scan(local, sdata, req)) { /* wait for the work to finish/time out */ rcu_assign_pointer(local->scan_req, req); rcu_assign_pointer(local->scan_sdata, sdata); return 0; } again: if (hw_scan) { u8 *ies; local->hw_scan_ies_bufsize = local->scan_ies_len + req->ie_len; if (ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS)) { int i, n_bands = 0; u8 bands_counted = 0; for (i = 0; i < req->n_channels; i++) { if (bands_counted & BIT(req->channels[i]->band)) continue; bands_counted |= BIT(req->channels[i]->band); n_bands++; } local->hw_scan_ies_bufsize *= n_bands; } local->hw_scan_req = kmalloc(struct_size(local->hw_scan_req, req.channels, req->n_channels) + local->hw_scan_ies_bufsize, GFP_KERNEL); if (!local->hw_scan_req) return -ENOMEM; local->hw_scan_req->req.ssids = req->ssids; local->hw_scan_req->req.n_ssids = req->n_ssids; /* None of the channels are actually set * up but let UBSAN know the boundaries. */ local->hw_scan_req->req.n_channels = req->n_channels; ies = (u8 *)local->hw_scan_req + sizeof(*local->hw_scan_req) + req->n_channels * sizeof(req->channels[0]); local->hw_scan_req->req.ie = ies; local->hw_scan_req->req.flags = req->flags; eth_broadcast_addr(local->hw_scan_req->req.bssid); local->hw_scan_req->req.duration = req->duration; local->hw_scan_req->req.duration_mandatory = req->duration_mandatory; local->hw_scan_req->req.tsf_report_link_id = req->tsf_report_link_id; local->hw_scan_band = 0; local->hw_scan_req->req.n_6ghz_params = req->n_6ghz_params; local->hw_scan_req->req.scan_6ghz_params = req->scan_6ghz_params; local->hw_scan_req->req.scan_6ghz = req->scan_6ghz; local->hw_scan_req->req.first_part = req->first_part; /* * After allocating local->hw_scan_req, we must * go through until ieee80211_prep_hw_scan(), so * anything that might be changed here and leave * this function early must not go after this * allocation. */ } rcu_assign_pointer(local->scan_req, req); rcu_assign_pointer(local->scan_sdata, sdata); if (req->flags & NL80211_SCAN_FLAG_RANDOM_ADDR) get_random_mask_addr(local->scan_addr, req->mac_addr, req->mac_addr_mask); else memcpy(local->scan_addr, sdata->vif.addr, ETH_ALEN); if (hw_scan) { __set_bit(SCAN_HW_SCANNING, &local->scanning); } else if ((req->n_channels == 1) && (req->channels[0] == local->hw.conf.chandef.chan)) { /* * If we are scanning only on the operating channel * then we do not need to stop normal activities */ unsigned long next_delay; __set_bit(SCAN_ONCHANNEL_SCANNING, &local->scanning); ieee80211_recalc_idle(local); /* Notify driver scan is starting, keep order of operations * same as normal software scan, in case that matters. */ drv_sw_scan_start(local, sdata, local->scan_addr); ieee80211_configure_filter(local); /* accept probe-responses */ /* We need to ensure power level is at max for scanning. */ ieee80211_hw_conf_chan(local); if ((req->channels[0]->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR)) || !req->n_ssids) { next_delay = IEEE80211_PASSIVE_CHANNEL_TIME; if (req->n_ssids) set_bit(SCAN_BEACON_WAIT, &local->scanning); } else { ieee80211_scan_state_send_probe(local, &next_delay); next_delay = IEEE80211_CHANNEL_TIME; } /* Now, just wait a bit and we are all done! */ wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, next_delay); return 0; } else { /* Do normal software scan */ __set_bit(SCAN_SW_SCANNING, &local->scanning); } ieee80211_recalc_idle(local); if (hw_scan) { WARN_ON(!ieee80211_prep_hw_scan(sdata)); rc = drv_hw_scan(local, sdata, local->hw_scan_req); } else { rc = ieee80211_start_sw_scan(local, sdata); } if (rc) { kfree(local->hw_scan_req); local->hw_scan_req = NULL; local->scanning = 0; ieee80211_recalc_idle(local); local->scan_req = NULL; RCU_INIT_POINTER(local->scan_sdata, NULL); } if (hw_scan && rc == 1) { /* * we can't fall back to software for P2P-GO * as it must update NoA etc. */ if (ieee80211_vif_type_p2p(&sdata->vif) == NL80211_IFTYPE_P2P_GO) return -EOPNOTSUPP; hw_scan = false; goto again; } return rc; } static unsigned long ieee80211_scan_get_channel_time(struct ieee80211_channel *chan) { /* * TODO: channel switching also consumes quite some time, * add that delay as well to get a better estimation */ if (chan->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR)) return IEEE80211_PASSIVE_CHANNEL_TIME; return IEEE80211_PROBE_DELAY + IEEE80211_CHANNEL_TIME; } static void ieee80211_scan_state_decision(struct ieee80211_local *local, unsigned long *next_delay) { bool associated = false; bool tx_empty = true; bool bad_latency; struct ieee80211_sub_if_data *sdata; struct ieee80211_channel *next_chan; enum mac80211_scan_state next_scan_state; struct cfg80211_scan_request *scan_req; lockdep_assert_wiphy(local->hw.wiphy); /* * check if at least one STA interface is associated, * check if at least one STA interface has pending tx frames * and grab the lowest used beacon interval */ list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type == NL80211_IFTYPE_STATION) { if (sdata->u.mgd.associated) { associated = true; if (!qdisc_all_tx_empty(sdata->dev)) { tx_empty = false; break; } } } } scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); next_chan = scan_req->channels[local->scan_channel_idx]; /* * we're currently scanning a different channel, let's * see if we can scan another channel without interfering * with the current traffic situation. * * Keep good latency, do not stay off-channel more than 125 ms. */ bad_latency = time_after(jiffies + ieee80211_scan_get_channel_time(next_chan), local->leave_oper_channel_time + HZ / 8); if (associated && !tx_empty) { if (scan_req->flags & NL80211_SCAN_FLAG_LOW_PRIORITY) next_scan_state = SCAN_ABORT; else next_scan_state = SCAN_SUSPEND; } else if (associated && bad_latency) { next_scan_state = SCAN_SUSPEND; } else { next_scan_state = SCAN_SET_CHANNEL; } local->next_scan_state = next_scan_state; *next_delay = 0; } static void ieee80211_scan_state_set_channel(struct ieee80211_local *local, unsigned long *next_delay) { int skip; struct ieee80211_channel *chan; struct cfg80211_scan_request *scan_req; scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); skip = 0; chan = scan_req->channels[local->scan_channel_idx]; local->scan_chandef.chan = chan; local->scan_chandef.center_freq1 = chan->center_freq; local->scan_chandef.freq1_offset = chan->freq_offset; local->scan_chandef.center_freq2 = 0; /* For S1G, only scan the 1MHz primaries. */ if (chan->band == NL80211_BAND_S1GHZ) { local->scan_chandef.width = NL80211_CHAN_WIDTH_1; local->scan_chandef.s1g_primary_2mhz = false; goto set_channel; } /* * If scanning on oper channel, use whatever channel-type * is currently in use. */ if (chan == local->hw.conf.chandef.chan) local->scan_chandef = local->hw.conf.chandef; else local->scan_chandef.width = NL80211_CHAN_WIDTH_20_NOHT; set_channel: if (ieee80211_hw_conf_chan(local)) skip = 1; /* advance state machine to next channel/band */ local->scan_channel_idx++; if (skip) { /* if we skip this channel return to the decision state */ local->next_scan_state = SCAN_DECISION; return; } /* * Probe delay is used to update the NAV, cf. 11.1.3.2.2 * (which unfortunately doesn't say _why_ step a) is done, * but it waits for the probe delay or until a frame is * received - and the received frame would update the NAV). * For now, we do not support waiting until a frame is * received. * * In any case, it is not necessary for a passive scan. */ if ((chan->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR)) || !scan_req->n_ssids) { *next_delay = max(msecs_to_jiffies(scan_req->duration), IEEE80211_PASSIVE_CHANNEL_TIME); local->next_scan_state = SCAN_DECISION; if (scan_req->n_ssids) set_bit(SCAN_BEACON_WAIT, &local->scanning); return; } /* active scan, send probes */ *next_delay = IEEE80211_PROBE_DELAY; local->next_scan_state = SCAN_SEND_PROBE; } static void ieee80211_scan_state_suspend(struct ieee80211_local *local, unsigned long *next_delay) { /* switch back to the operating channel */ local->scan_chandef.chan = NULL; ieee80211_hw_conf_chan(local); /* disable PS */ ieee80211_offchannel_return(local); *next_delay = HZ / 5; /* afterwards, resume scan & go to next channel */ local->next_scan_state = SCAN_RESUME; } static void ieee80211_scan_state_resume(struct ieee80211_local *local, unsigned long *next_delay) { ieee80211_offchannel_stop_vifs(local); if (local->ops->flush) { ieee80211_flush_queues(local, NULL, false); *next_delay = 0; } else *next_delay = HZ / 10; /* remember when we left the operating channel */ local->leave_oper_channel_time = jiffies; /* advance to the next channel to be scanned */ local->next_scan_state = SCAN_SET_CHANNEL; } void ieee80211_scan_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, scan_work.work); struct ieee80211_sub_if_data *sdata; struct cfg80211_scan_request *scan_req; unsigned long next_delay = 0; bool aborted; lockdep_assert_wiphy(local->hw.wiphy); if (!ieee80211_can_run_worker(local)) { aborted = true; goto out_complete; } sdata = rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); /* When scanning on-channel, the first-callback means completed. */ if (test_bit(SCAN_ONCHANNEL_SCANNING, &local->scanning)) { aborted = test_and_clear_bit(SCAN_ABORTED, &local->scanning); goto out_complete; } if (test_and_clear_bit(SCAN_COMPLETED, &local->scanning)) { aborted = test_and_clear_bit(SCAN_ABORTED, &local->scanning); goto out_complete; } if (!sdata || !scan_req) return; if (!local->scanning) { int rc; RCU_INIT_POINTER(local->scan_req, NULL); RCU_INIT_POINTER(local->scan_sdata, NULL); rc = __ieee80211_start_scan(sdata, scan_req); if (!rc) return; /* need to complete scan in cfg80211 */ rcu_assign_pointer(local->scan_req, scan_req); aborted = true; goto out_complete; } clear_bit(SCAN_BEACON_WAIT, &local->scanning); /* * as long as no delay is required advance immediately * without scheduling a new work */ do { if (!ieee80211_sdata_running(sdata)) { aborted = true; goto out_complete; } if (test_and_clear_bit(SCAN_BEACON_DONE, &local->scanning) && local->next_scan_state == SCAN_DECISION) local->next_scan_state = SCAN_SEND_PROBE; switch (local->next_scan_state) { case SCAN_DECISION: /* if no more bands/channels left, complete scan */ if (local->scan_channel_idx >= scan_req->n_channels) { aborted = false; goto out_complete; } ieee80211_scan_state_decision(local, &next_delay); break; case SCAN_SET_CHANNEL: ieee80211_scan_state_set_channel(local, &next_delay); break; case SCAN_SEND_PROBE: ieee80211_scan_state_send_probe(local, &next_delay); break; case SCAN_SUSPEND: ieee80211_scan_state_suspend(local, &next_delay); break; case SCAN_RESUME: ieee80211_scan_state_resume(local, &next_delay); break; case SCAN_ABORT: aborted = true; goto out_complete; } } while (next_delay == 0); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, next_delay); return; out_complete: __ieee80211_scan_completed(&local->hw, aborted); } int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, struct cfg80211_scan_request *req) { lockdep_assert_wiphy(sdata->local->hw.wiphy); return __ieee80211_start_scan(sdata, req); } int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, const u8 *ssid, u8 ssid_len, struct ieee80211_channel **channels, unsigned int n_channels) { struct ieee80211_local *local = sdata->local; int i, n_ch = 0; enum nl80211_band band; lockdep_assert_wiphy(local->hw.wiphy); /* busy scanning */ if (local->scan_req) return -EBUSY; /* fill internal scan request */ if (!channels) { int max_n; for (band = 0; band < NUM_NL80211_BANDS; band++) { if (!local->hw.wiphy->bands[band] || band == NL80211_BAND_6GHZ || band == NL80211_BAND_S1GHZ) continue; max_n = local->hw.wiphy->bands[band]->n_channels; for (i = 0; i < max_n; i++) { struct ieee80211_channel *tmp_ch = &local->hw.wiphy->bands[band]->channels[i]; if (tmp_ch->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_DISABLED) || !cfg80211_wdev_channel_allowed(&sdata->wdev, tmp_ch)) continue; local->int_scan_req->channels[n_ch] = tmp_ch; n_ch++; } } if (WARN_ON_ONCE(n_ch == 0)) return -EINVAL; local->int_scan_req->n_channels = n_ch; } else { for (i = 0; i < n_channels; i++) { if (channels[i]->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_DISABLED) || !cfg80211_wdev_channel_allowed(&sdata->wdev, channels[i])) continue; local->int_scan_req->channels[n_ch] = channels[i]; n_ch++; } if (n_ch == 0) return -EINVAL; local->int_scan_req->n_channels = n_ch; } local->int_scan_req->ssids = &local->scan_ssid; local->int_scan_req->n_ssids = 1; memcpy(local->int_scan_req->ssids[0].ssid, ssid, IEEE80211_MAX_SSID_LEN); local->int_scan_req->ssids[0].ssid_len = ssid_len; return __ieee80211_start_scan(sdata, sdata->local->int_scan_req); } void ieee80211_scan_cancel(struct ieee80211_local *local) { /* ensure a new scan cannot be queued */ lockdep_assert_wiphy(local->hw.wiphy); /* * We are canceling software scan, or deferred scan that was not * yet really started (see __ieee80211_start_scan ). * * Regarding hardware scan: * - we can not call __ieee80211_scan_completed() as when * SCAN_HW_SCANNING bit is set this function change * local->hw_scan_req to operate on 5G band, what race with * driver which can use local->hw_scan_req * * - we can not cancel scan_work since driver can schedule it * by ieee80211_scan_completed(..., true) to finish scan * * Hence we only call the cancel_hw_scan() callback, but the low-level * driver is still responsible for calling ieee80211_scan_completed() * after the scan was completed/aborted. */ if (!local->scan_req) return; /* * We have a scan running and the driver already reported completion, * but the worker hasn't run yet or is stuck on the mutex - mark it as * cancelled. */ if (test_bit(SCAN_HW_SCANNING, &local->scanning) && test_bit(SCAN_COMPLETED, &local->scanning)) { set_bit(SCAN_HW_CANCELLED, &local->scanning); return; } if (test_bit(SCAN_HW_SCANNING, &local->scanning)) { /* * Make sure that __ieee80211_scan_completed doesn't trigger a * scan on another band. */ set_bit(SCAN_HW_CANCELLED, &local->scanning); if (local->ops->cancel_hw_scan) drv_cancel_hw_scan(local, rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx))); return; } wiphy_delayed_work_cancel(local->hw.wiphy, &local->scan_work); /* and clean up */ memset(&local->scan_info, 0, sizeof(local->scan_info)); __ieee80211_scan_completed(&local->hw, true); } int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, struct cfg80211_sched_scan_request *req) { struct ieee80211_local *local = sdata->local; struct ieee80211_scan_ies sched_scan_ies = {}; struct cfg80211_chan_def chandef; int ret, i, iebufsz, num_bands = 0; u32 rate_masks[NUM_NL80211_BANDS] = {}; u8 bands_used = 0; u8 *ie; u32 flags = 0; lockdep_assert_wiphy(local->hw.wiphy); iebufsz = local->scan_ies_len + req->ie_len; if (!local->ops->sched_scan_start) return -EOPNOTSUPP; for (i = 0; i < NUM_NL80211_BANDS; i++) { if (local->hw.wiphy->bands[i]) { bands_used |= BIT(i); rate_masks[i] = (u32) -1; num_bands++; } } if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; ie = kcalloc(iebufsz, num_bands, GFP_KERNEL); if (!ie) { ret = -ENOMEM; goto out; } ieee80211_prepare_scan_chandef(&chandef); ret = ieee80211_build_preq_ies(sdata, ie, num_bands * iebufsz, &sched_scan_ies, req->ie, req->ie_len, bands_used, rate_masks, &chandef, flags); if (ret < 0) goto error; ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies); if (ret == 0) { rcu_assign_pointer(local->sched_scan_sdata, sdata); rcu_assign_pointer(local->sched_scan_req, req); } error: kfree(ie); out: if (ret) { /* Clean in case of failure after HW restart or upon resume. */ RCU_INIT_POINTER(local->sched_scan_sdata, NULL); RCU_INIT_POINTER(local->sched_scan_req, NULL); } return ret; } int ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, struct cfg80211_sched_scan_request *req) { struct ieee80211_local *local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); if (rcu_access_pointer(local->sched_scan_sdata)) return -EBUSY; return __ieee80211_request_sched_scan_start(sdata, req); } int ieee80211_request_sched_scan_stop(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sched_scan_sdata; int ret = -ENOENT; lockdep_assert_wiphy(local->hw.wiphy); if (!local->ops->sched_scan_stop) return -EOPNOTSUPP; /* We don't want to restart sched scan anymore. */ RCU_INIT_POINTER(local->sched_scan_req, NULL); sched_scan_sdata = rcu_dereference_protected(local->sched_scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); if (sched_scan_sdata) { ret = drv_sched_scan_stop(local, sched_scan_sdata); if (!ret) RCU_INIT_POINTER(local->sched_scan_sdata, NULL); } return ret; } void ieee80211_sched_scan_results(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); trace_api_sched_scan_results(local); cfg80211_sched_scan_results(hw->wiphy, 0); } EXPORT_SYMBOL(ieee80211_sched_scan_results); void ieee80211_sched_scan_end(struct ieee80211_local *local) { lockdep_assert_wiphy(local->hw.wiphy); if (!rcu_access_pointer(local->sched_scan_sdata)) return; RCU_INIT_POINTER(local->sched_scan_sdata, NULL); /* If sched scan was aborted by the driver. */ RCU_INIT_POINTER(local->sched_scan_req, NULL); cfg80211_sched_scan_stopped_locked(local->hw.wiphy, 0); } void ieee80211_sched_scan_stopped_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, sched_scan_stopped_work); ieee80211_sched_scan_end(local); } void ieee80211_sched_scan_stopped(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); trace_api_sched_scan_stopped(local); /* * this shouldn't really happen, so for simplicity * simply ignore it, and let mac80211 reconfigure * the sched scan later on. */ if (local->in_reconfig) return; wiphy_work_queue(hw->wiphy, &local->sched_scan_stopped_work); } EXPORT_SYMBOL(ieee80211_sched_scan_stopped);
1544 1545 1547 1544 1322 1294 1296 306 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 /* SPDX-License-Identifier: GPL-2.0-only */ /* * fs/kernfs/kernfs-internal.h - kernfs internal header file * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de> */ #ifndef __KERNFS_INTERNAL_H #define __KERNFS_INTERNAL_H #include <linux/lockdep.h> #include <linux/fs.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/xattr.h> #include <linux/kernfs.h> #include <linux/fs_context.h> struct kernfs_iattrs { kuid_t ia_uid; kgid_t ia_gid; struct timespec64 ia_atime; struct timespec64 ia_mtime; struct timespec64 ia_ctime; struct simple_xattrs xattrs; atomic_t nr_user_xattrs; atomic_t user_xattr_size; }; struct kernfs_root { /* published fields */ struct kernfs_node *kn; unsigned int flags; /* KERNFS_ROOT_* flags */ /* private fields, do not use outside kernfs proper */ struct idr ino_idr; spinlock_t kernfs_idr_lock; /* root->ino_idr */ u32 last_id_lowbits; u32 id_highbits; struct kernfs_syscall_ops *syscall_ops; /* list of kernfs_super_info of this root, protected by kernfs_rwsem */ struct list_head supers; wait_queue_head_t deactivate_waitq; struct rw_semaphore kernfs_rwsem; struct rw_semaphore kernfs_iattr_rwsem; struct rw_semaphore kernfs_supers_rwsem; /* kn->parent and kn->name */ rwlock_t kernfs_rename_lock; struct rcu_head rcu; }; /* +1 to avoid triggering overflow warning when negating it */ #define KN_DEACTIVATED_BIAS (INT_MIN + 1) /* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */ /** * kernfs_root - find out the kernfs_root a kernfs_node belongs to * @kn: kernfs_node of interest * * Return: the kernfs_root @kn belongs to. */ static inline struct kernfs_root *kernfs_root(const struct kernfs_node *kn) { const struct kernfs_node *knp; /* if parent exists, it's always a dir; otherwise, @sd is a dir */ guard(rcu)(); knp = rcu_dereference(kn->__parent); if (knp) kn = knp; return kn->dir.root; } /* * mount.c */ struct kernfs_super_info { struct super_block *sb; /* * The root associated with this super_block. Each super_block is * identified by the root and ns it's associated with. */ struct kernfs_root *root; /* * Each sb is associated with one namespace tag, currently the * network namespace of the task which mounted this kernfs * instance. If multiple tags become necessary, make the following * an array and compare kernfs_node tag against every entry. */ const void *ns; /* anchored at kernfs_root->supers, protected by kernfs_rwsem */ struct list_head node; }; #define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) static inline bool kernfs_root_is_locked(const struct kernfs_node *kn) { return lockdep_is_held(&kernfs_root(kn)->kernfs_rwsem); } static inline bool kernfs_rename_is_locked(const struct kernfs_node *kn) { return lockdep_is_held(&kernfs_root(kn)->kernfs_rename_lock); } static inline const char *kernfs_rcu_name(const struct kernfs_node *kn) { return rcu_dereference_check(kn->name, kernfs_root_is_locked(kn)); } static inline struct kernfs_node *kernfs_parent(const struct kernfs_node *kn) { /* * The kernfs_node::__parent remains valid within a RCU section. The kn * can be reparented (and renamed) which changes the entry. This can be * avoided by locking kernfs_root::kernfs_rwsem or * kernfs_root::kernfs_rename_lock. * Both locks can be used to obtain a reference on __parent. Once the * reference count reaches 0 then the node is about to be freed * and can not be renamed (or become a different parent) anymore. */ return rcu_dereference_check(kn->__parent, kernfs_root_is_locked(kn) || kernfs_rename_is_locked(kn) || !atomic_read(&kn->count)); } static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry) { if (d_really_is_negative(dentry)) return NULL; return d_inode(dentry)->i_private; } static inline void kernfs_set_rev(struct kernfs_node *parent, struct dentry *dentry) { dentry->d_time = parent->dir.rev; } static inline void kernfs_inc_rev(struct kernfs_node *parent) { parent->dir.rev++; } static inline bool kernfs_dir_changed(struct kernfs_node *parent, struct dentry *dentry) { if (parent->dir.rev != dentry->d_time) return true; return false; } extern const struct super_operations kernfs_sops; extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; /* * inode.c */ extern const struct xattr_handler * const kernfs_xattr_handlers[]; void kernfs_evict_inode(struct inode *inode); int kernfs_iop_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr); int kernfs_iop_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); /* * dir.c */ extern const struct dentry_operations kernfs_dops; extern const struct file_operations kernfs_dir_fops; extern const struct inode_operations kernfs_dir_iops; struct kernfs_node *kernfs_get_active(struct kernfs_node *kn); void kernfs_put_active(struct kernfs_node *kn); int kernfs_add_one(struct kernfs_node *kn); struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, unsigned flags); /* * file.c */ extern const struct file_operations kernfs_file_fops; bool kernfs_should_drain_open_files(struct kernfs_node *kn); void kernfs_drain_open_files(struct kernfs_node *kn); /* * symlink.c */ extern const struct inode_operations kernfs_symlink_iops; /* * kernfs locks */ extern struct kernfs_global_locks *kernfs_locks; #endif /* __KERNFS_INTERNAL_H */
4 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 // SPDX-License-Identifier: GPL-2.0-only /* * (C) 2007 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/gen_stats.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_rateest.h> #include <net/netfilter/xt_rateest.h> static bool xt_rateest_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_rateest_match_info *info = par->matchinfo; struct gnet_stats_rate_est64 sample = {0}; u_int32_t bps1, bps2, pps1, pps2; bool ret = true; gen_estimator_read(&info->est1->rate_est, &sample); if (info->flags & XT_RATEEST_MATCH_DELTA) { bps1 = info->bps1 >= sample.bps ? info->bps1 - sample.bps : 0; pps1 = info->pps1 >= sample.pps ? info->pps1 - sample.pps : 0; } else { bps1 = sample.bps; pps1 = sample.pps; } if (info->flags & XT_RATEEST_MATCH_ABS) { bps2 = info->bps2; pps2 = info->pps2; } else { gen_estimator_read(&info->est2->rate_est, &sample); if (info->flags & XT_RATEEST_MATCH_DELTA) { bps2 = info->bps2 >= sample.bps ? info->bps2 - sample.bps : 0; pps2 = info->pps2 >= sample.pps ? info->pps2 - sample.pps : 0; } else { bps2 = sample.bps; pps2 = sample.pps; } } switch (info->mode) { case XT_RATEEST_MATCH_LT: if (info->flags & XT_RATEEST_MATCH_BPS) ret &= bps1 < bps2; if (info->flags & XT_RATEEST_MATCH_PPS) ret &= pps1 < pps2; break; case XT_RATEEST_MATCH_GT: if (info->flags & XT_RATEEST_MATCH_BPS) ret &= bps1 > bps2; if (info->flags & XT_RATEEST_MATCH_PPS) ret &= pps1 > pps2; break; case XT_RATEEST_MATCH_EQ: if (info->flags & XT_RATEEST_MATCH_BPS) ret &= bps1 == bps2; if (info->flags & XT_RATEEST_MATCH_PPS) ret &= pps1 == pps2; break; } ret ^= info->flags & XT_RATEEST_MATCH_INVERT ? true : false; return ret; } static int xt_rateest_mt_checkentry(const struct xt_mtchk_param *par) { struct xt_rateest_match_info *info = par->matchinfo; struct xt_rateest *est1, *est2; int ret = -EINVAL; if (hweight32(info->flags & (XT_RATEEST_MATCH_ABS | XT_RATEEST_MATCH_REL)) != 1) goto err1; if (!(info->flags & (XT_RATEEST_MATCH_BPS | XT_RATEEST_MATCH_PPS))) goto err1; switch (info->mode) { case XT_RATEEST_MATCH_EQ: case XT_RATEEST_MATCH_LT: case XT_RATEEST_MATCH_GT: break; default: goto err1; } ret = -ENOENT; est1 = xt_rateest_lookup(par->net, info->name1); if (!est1) goto err1; est2 = NULL; if (info->flags & XT_RATEEST_MATCH_REL) { est2 = xt_rateest_lookup(par->net, info->name2); if (!est2) goto err2; } info->est1 = est1; info->est2 = est2; return 0; err2: xt_rateest_put(par->net, est1); err1: return ret; } static void xt_rateest_mt_destroy(const struct xt_mtdtor_param *par) { struct xt_rateest_match_info *info = par->matchinfo; xt_rateest_put(par->net, info->est1); if (info->est2) xt_rateest_put(par->net, info->est2); } static struct xt_match xt_rateest_mt_reg __read_mostly = { .name = "rateest", .revision = 0, .family = NFPROTO_UNSPEC, .match = xt_rateest_mt, .checkentry = xt_rateest_mt_checkentry, .destroy = xt_rateest_mt_destroy, .matchsize = sizeof(struct xt_rateest_match_info), .usersize = offsetof(struct xt_rateest_match_info, est1), .me = THIS_MODULE, }; static int __init xt_rateest_mt_init(void) { return xt_register_match(&xt_rateest_mt_reg); } static void __exit xt_rateest_mt_fini(void) { xt_unregister_match(&xt_rateest_mt_reg); } MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("xtables rate estimator match"); MODULE_ALIAS("ipt_rateest"); MODULE_ALIAS("ip6t_rateest"); module_init(xt_rateest_mt_init); module_exit(xt_rateest_mt_fini);
30 4 26 26 7 7 9 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 // SPDX-License-Identifier: GPL-2.0-only /* * * Generic part shared by ipv4 and ipv6 backends. */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> #include <linux/in.h> #include <net/xfrm.h> static const struct nla_policy nft_xfrm_policy[NFTA_XFRM_MAX + 1] = { [NFTA_XFRM_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_XFRM_DIR] = { .type = NLA_U8 }, [NFTA_XFRM_SPNUM] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_XFRM_DREG] = { .type = NLA_U32 }, }; struct nft_xfrm { enum nft_xfrm_keys key:8; u8 dreg; u8 dir; u8 spnum; u8 len; }; static int nft_xfrm_get_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_xfrm *priv = nft_expr_priv(expr); unsigned int len = 0; u32 spnum = 0; u8 dir; if (!tb[NFTA_XFRM_KEY] || !tb[NFTA_XFRM_DIR] || !tb[NFTA_XFRM_DREG]) return -EINVAL; switch (ctx->family) { case NFPROTO_IPV4: case NFPROTO_IPV6: case NFPROTO_INET: break; default: return -EOPNOTSUPP; } priv->key = ntohl(nla_get_be32(tb[NFTA_XFRM_KEY])); switch (priv->key) { case NFT_XFRM_KEY_REQID: case NFT_XFRM_KEY_SPI: len = sizeof(u32); break; case NFT_XFRM_KEY_DADDR_IP4: case NFT_XFRM_KEY_SADDR_IP4: len = sizeof(struct in_addr); break; case NFT_XFRM_KEY_DADDR_IP6: case NFT_XFRM_KEY_SADDR_IP6: len = sizeof(struct in6_addr); break; default: return -EINVAL; } dir = nla_get_u8(tb[NFTA_XFRM_DIR]); switch (dir) { case XFRM_POLICY_IN: case XFRM_POLICY_OUT: priv->dir = dir; break; default: return -EINVAL; } if (tb[NFTA_XFRM_SPNUM]) spnum = ntohl(nla_get_be32(tb[NFTA_XFRM_SPNUM])); if (spnum >= XFRM_MAX_DEPTH) return -ERANGE; priv->spnum = spnum; priv->len = len; return nft_parse_register_store(ctx, tb[NFTA_XFRM_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, len); } /* Return true if key asks for daddr/saddr and current * state does have a valid address (BEET, TUNNEL). */ static bool xfrm_state_addr_ok(enum nft_xfrm_keys k, u8 family, u8 mode) { switch (k) { case NFT_XFRM_KEY_DADDR_IP4: case NFT_XFRM_KEY_SADDR_IP4: if (family == NFPROTO_IPV4) break; return false; case NFT_XFRM_KEY_DADDR_IP6: case NFT_XFRM_KEY_SADDR_IP6: if (family == NFPROTO_IPV6) break; return false; default: return true; } return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL || mode == XFRM_MODE_IPTFS; } static void nft_xfrm_state_get_key(const struct nft_xfrm *priv, struct nft_regs *regs, const struct xfrm_state *state) { u32 *dest = &regs->data[priv->dreg]; if (!xfrm_state_addr_ok(priv->key, state->props.family, state->props.mode)) { regs->verdict.code = NFT_BREAK; return; } switch (priv->key) { case NFT_XFRM_KEY_UNSPEC: case __NFT_XFRM_KEY_MAX: WARN_ON_ONCE(1); break; case NFT_XFRM_KEY_DADDR_IP4: *dest = (__force __u32)state->id.daddr.a4; return; case NFT_XFRM_KEY_DADDR_IP6: memcpy(dest, &state->id.daddr.in6, sizeof(struct in6_addr)); return; case NFT_XFRM_KEY_SADDR_IP4: *dest = (__force __u32)state->props.saddr.a4; return; case NFT_XFRM_KEY_SADDR_IP6: memcpy(dest, &state->props.saddr.in6, sizeof(struct in6_addr)); return; case NFT_XFRM_KEY_REQID: *dest = state->props.reqid; return; case NFT_XFRM_KEY_SPI: *dest = (__force __u32)state->id.spi; return; } regs->verdict.code = NFT_BREAK; } static void nft_xfrm_get_eval_in(const struct nft_xfrm *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct sec_path *sp = skb_sec_path(pkt->skb); const struct xfrm_state *state; if (sp == NULL || sp->len <= priv->spnum) { regs->verdict.code = NFT_BREAK; return; } state = sp->xvec[priv->spnum]; nft_xfrm_state_get_key(priv, regs, state); } static void nft_xfrm_get_eval_out(const struct nft_xfrm *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct dst_entry *dst = skb_dst(pkt->skb); int i; for (i = 0; dst && dst->xfrm; dst = ((const struct xfrm_dst *)dst)->child, i++) { if (i < priv->spnum) continue; nft_xfrm_state_get_key(priv, regs, dst->xfrm); return; } regs->verdict.code = NFT_BREAK; } static void nft_xfrm_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_xfrm *priv = nft_expr_priv(expr); switch (priv->dir) { case XFRM_POLICY_IN: nft_xfrm_get_eval_in(priv, regs, pkt); break; case XFRM_POLICY_OUT: nft_xfrm_get_eval_out(priv, regs, pkt); break; default: WARN_ON_ONCE(1); regs->verdict.code = NFT_BREAK; break; } } static int nft_xfrm_get_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_xfrm *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_XFRM_DREG, priv->dreg)) return -1; if (nla_put_be32(skb, NFTA_XFRM_KEY, htonl(priv->key))) return -1; if (nla_put_u8(skb, NFTA_XFRM_DIR, priv->dir)) return -1; if (nla_put_be32(skb, NFTA_XFRM_SPNUM, htonl(priv->spnum))) return -1; return 0; } static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_xfrm *priv = nft_expr_priv(expr); unsigned int hooks; if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && ctx->family != NFPROTO_INET) return -EOPNOTSUPP; switch (priv->dir) { case XFRM_POLICY_IN: hooks = (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_PRE_ROUTING); break; case XFRM_POLICY_OUT: hooks = (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING); break; default: WARN_ON_ONCE(1); return -EINVAL; } return nft_chain_validate_hooks(ctx->chain, hooks); } static bool nft_xfrm_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { const struct nft_xfrm *priv = nft_expr_priv(expr); const struct nft_xfrm *xfrm; if (!nft_reg_track_cmp(track, expr, priv->dreg)) { nft_reg_track_update(track, expr, priv->dreg, priv->len); return false; } xfrm = nft_expr_priv(track->regs[priv->dreg].selector); if (priv->key != xfrm->key || priv->dreg != xfrm->dreg || priv->dir != xfrm->dir || priv->spnum != xfrm->spnum) { nft_reg_track_update(track, expr, priv->dreg, priv->len); return false; } if (!track->regs[priv->dreg].bitwise) return true; return nft_expr_reduce_bitwise(track, expr); } static struct nft_expr_type nft_xfrm_type; static const struct nft_expr_ops nft_xfrm_get_ops = { .type = &nft_xfrm_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_xfrm)), .eval = nft_xfrm_get_eval, .init = nft_xfrm_get_init, .dump = nft_xfrm_get_dump, .validate = nft_xfrm_validate, .reduce = nft_xfrm_reduce, }; static struct nft_expr_type nft_xfrm_type __read_mostly = { .name = "xfrm", .ops = &nft_xfrm_get_ops, .policy = nft_xfrm_policy, .maxattr = NFTA_XFRM_MAX, .owner = THIS_MODULE, }; static int __init nft_xfrm_module_init(void) { return nft_register_expr(&nft_xfrm_type); } static void __exit nft_xfrm_module_exit(void) { nft_unregister_expr(&nft_xfrm_type); } module_init(nft_xfrm_module_init); module_exit(nft_xfrm_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("nf_tables: xfrm/IPSec matching"); MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); MODULE_AUTHOR("Máté Eckl <ecklm94@gmail.com>"); MODULE_ALIAS_NFT_EXPR("xfrm");
34 33 34 1 1 1 1 3 3 3 3 3 3 3 3 3 3 2 3 3 3 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 // SPDX-License-Identifier: GPL-2.0 /* * Central processing for nfsd. * * Authors: Olaf Kirch (okir@monad.swb.de) * * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */ #include <linux/sched/signal.h> #include <linux/freezer.h> #include <linux/module.h> #include <linux/fs_struct.h> #include <linux/swap.h> #include <linux/siphash.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/svc_xprt.h> #include <linux/lockd/bind.h> #include <linux/nfsacl.h> #include <linux/nfslocalio.h> #include <linux/seq_file.h> #include <linux/inetdevice.h> #include <net/addrconf.h> #include <net/ipv6.h> #include <net/net_namespace.h> #include "nfsd.h" #include "cache.h" #include "vfs.h" #include "netns.h" #include "filecache.h" #include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_SVC atomic_t nfsd_th_cnt = ATOMIC_INIT(0); static int nfsd(void *vrqstp); #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static int nfsd_acl_rpcbind_set(struct net *, const struct svc_program *, u32, int, unsigned short, unsigned short); static __be32 nfsd_acl_init_request(struct svc_rqst *, const struct svc_program *, struct svc_process_info *); #endif static int nfsd_rpcbind_set(struct net *, const struct svc_program *, u32, int, unsigned short, unsigned short); static __be32 nfsd_init_request(struct svc_rqst *, const struct svc_program *, struct svc_process_info *); /* * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and some members * of the svc_serv struct such as ->sv_temp_socks and ->sv_permsocks. * * Finally, the nfsd_mutex also protects some of the global variables that are * accessed when nfsd starts and that are settable via the write_* routines in * nfsctl.c. In particular: * * user_recovery_dirname * user_lease_time * nfsd_versions */ DEFINE_MUTEX(nfsd_mutex); #if IS_ENABLED(CONFIG_NFS_LOCALIO) static const struct svc_version *localio_versions[] = { [1] = &localio_version1, }; #define NFSD_LOCALIO_NRVERS ARRAY_SIZE(localio_versions) #endif /* CONFIG_NFS_LOCALIO */ #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static const struct svc_version *nfsd_acl_version[] = { # if defined(CONFIG_NFSD_V2_ACL) [2] = &nfsd_acl_version2, # endif # if defined(CONFIG_NFSD_V3_ACL) [3] = &nfsd_acl_version3, # endif }; #define NFSD_ACL_MINVERS 2 #define NFSD_ACL_NRVERS ARRAY_SIZE(nfsd_acl_version) #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ static const struct svc_version *nfsd_version[NFSD_MAXVERS+1] = { #if defined(CONFIG_NFSD_V2) [2] = &nfsd_version2, #endif [3] = &nfsd_version3, #if defined(CONFIG_NFSD_V4) [4] = &nfsd_version4, #endif }; struct svc_program nfsd_programs[] = { { .pg_prog = NFS_PROGRAM, /* program number */ .pg_nvers = NFSD_MAXVERS+1, /* nr of entries in nfsd_version */ .pg_vers = nfsd_version, /* version table */ .pg_name = "nfsd", /* program name */ .pg_class = "nfsd", /* authentication class */ .pg_authenticate = svc_set_client, /* export authentication */ .pg_init_request = nfsd_init_request, .pg_rpcbind_set = nfsd_rpcbind_set, }, #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) { .pg_prog = NFS_ACL_PROGRAM, .pg_nvers = NFSD_ACL_NRVERS, .pg_vers = nfsd_acl_version, .pg_name = "nfsacl", .pg_class = "nfsd", .pg_authenticate = svc_set_client, .pg_init_request = nfsd_acl_init_request, .pg_rpcbind_set = nfsd_acl_rpcbind_set, }, #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ #if IS_ENABLED(CONFIG_NFS_LOCALIO) { .pg_prog = NFS_LOCALIO_PROGRAM, .pg_nvers = NFSD_LOCALIO_NRVERS, .pg_vers = localio_versions, .pg_name = "nfslocalio", .pg_class = "nfsd", .pg_authenticate = svc_set_client, .pg_init_request = svc_generic_init_request, .pg_rpcbind_set = svc_generic_rpcbind_set, } #endif /* CONFIG_NFS_LOCALIO */ }; bool nfsd_support_version(int vers) { if (vers >= NFSD_MINVERS && vers <= NFSD_MAXVERS) return nfsd_version[vers] != NULL; return false; } int nfsd_vers(struct nfsd_net *nn, int vers, enum vers_op change) { if (vers < NFSD_MINVERS || vers > NFSD_MAXVERS) return 0; switch(change) { case NFSD_SET: nn->nfsd_versions[vers] = nfsd_support_version(vers); break; case NFSD_CLEAR: nn->nfsd_versions[vers] = false; break; case NFSD_TEST: return nn->nfsd_versions[vers]; case NFSD_AVAIL: return nfsd_support_version(vers); } return 0; } static void nfsd_adjust_nfsd_versions4(struct nfsd_net *nn) { unsigned i; for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++) { if (nn->nfsd4_minorversions[i]) return; } nfsd_vers(nn, 4, NFSD_CLEAR); } int nfsd_minorversion(struct nfsd_net *nn, u32 minorversion, enum vers_op change) { if (minorversion > NFSD_SUPPORTED_MINOR_VERSION && change != NFSD_AVAIL) return -1; switch(change) { case NFSD_SET: nfsd_vers(nn, 4, NFSD_SET); nn->nfsd4_minorversions[minorversion] = nfsd_vers(nn, 4, NFSD_TEST); break; case NFSD_CLEAR: nn->nfsd4_minorversions[minorversion] = false; nfsd_adjust_nfsd_versions4(nn); break; case NFSD_TEST: return nn->nfsd4_minorversions[minorversion]; case NFSD_AVAIL: return minorversion <= NFSD_SUPPORTED_MINOR_VERSION && nfsd_vers(nn, 4, NFSD_AVAIL); } return 0; } bool nfsd_net_try_get(struct net *net) __must_hold(rcu) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); return (nn && percpu_ref_tryget_live(&nn->nfsd_net_ref)); } void nfsd_net_put(struct net *net) __must_hold(rcu) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); percpu_ref_put(&nn->nfsd_net_ref); } static void nfsd_net_done(struct percpu_ref *ref) { struct nfsd_net *nn = container_of(ref, struct nfsd_net, nfsd_net_ref); complete(&nn->nfsd_net_confirm_done); } static void nfsd_net_free(struct percpu_ref *ref) { struct nfsd_net *nn = container_of(ref, struct nfsd_net, nfsd_net_ref); complete(&nn->nfsd_net_free_done); } /* * Maximum number of nfsd processes */ #define NFSD_MAXSERVS 8192 int nfsd_nrthreads(struct net *net) { int rv = 0; struct nfsd_net *nn = net_generic(net, nfsd_net_id); mutex_lock(&nfsd_mutex); if (nn->nfsd_serv) rv = nn->nfsd_serv->sv_nrthreads; mutex_unlock(&nfsd_mutex); return rv; } static int nfsd_init_socks(struct net *net, const struct cred *cred) { int error; struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (!list_empty(&nn->nfsd_serv->sv_permsocks)) return 0; error = svc_xprt_create(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT, SVC_SOCK_DEFAULTS, cred); if (error < 0) return error; error = svc_xprt_create(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT, SVC_SOCK_DEFAULTS, cred); if (error < 0) return error; return 0; } static int nfsd_users = 0; static int nfsd_startup_generic(void) { int ret; if (nfsd_users++) return 0; ret = nfsd_file_cache_init(); if (ret) goto dec_users; ret = nfs4_state_start(); if (ret) goto out_file_cache; return 0; out_file_cache: nfsd_file_cache_shutdown(); dec_users: nfsd_users--; return ret; } static void nfsd_shutdown_generic(void) { if (--nfsd_users) return; nfs4_state_shutdown(); nfsd_file_cache_shutdown(); } static bool nfsd_needs_lockd(struct nfsd_net *nn) { return nfsd_vers(nn, 2, NFSD_TEST) || nfsd_vers(nn, 3, NFSD_TEST); } /** * nfsd_copy_write_verifier - Atomically copy a write verifier * @verf: buffer in which to receive the verifier cookie * @nn: NFS net namespace * * This function provides a wait-free mechanism for copying the * namespace's write verifier without tearing it. */ void nfsd_copy_write_verifier(__be32 verf[2], struct nfsd_net *nn) { unsigned int seq; do { seq = read_seqbegin(&nn->writeverf_lock); memcpy(verf, nn->writeverf, sizeof(nn->writeverf)); } while (read_seqretry(&nn->writeverf_lock, seq)); } static void nfsd_reset_write_verifier_locked(struct nfsd_net *nn) { struct timespec64 now; u64 verf; /* * Because the time value is hashed, y2038 time_t overflow * is irrelevant in this usage. */ ktime_get_raw_ts64(&now); verf = siphash_2u64(now.tv_sec, now.tv_nsec, &nn->siphash_key); memcpy(nn->writeverf, &verf, sizeof(nn->writeverf)); } /** * nfsd_reset_write_verifier - Generate a new write verifier * @nn: NFS net namespace * * This function updates the ->writeverf field of @nn. This field * contains an opaque cookie that, according to Section 18.32.3 of * RFC 8881, "the client can use to determine whether a server has * changed instance state (e.g., server restart) between a call to * WRITE and a subsequent call to either WRITE or COMMIT. This * cookie MUST be unchanged during a single instance of the NFSv4.1 * server and MUST be unique between instances of the NFSv4.1 * server." */ void nfsd_reset_write_verifier(struct nfsd_net *nn) { write_seqlock(&nn->writeverf_lock); nfsd_reset_write_verifier_locked(nn); write_sequnlock(&nn->writeverf_lock); } /* * Crank up a set of per-namespace resources for a new NFSD instance, * including lockd, a duplicate reply cache, an open file cache * instance, and a cache of NFSv4 state objects. */ static int nfsd_startup_net(struct net *net, const struct cred *cred) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); int ret; if (nn->nfsd_net_up) return 0; ret = nfsd_startup_generic(); if (ret) return ret; ret = nfsd_init_socks(net, cred); if (ret) goto out_socks; if (nfsd_needs_lockd(nn) && !nn->lockd_up) { ret = lockd_up(net, cred); if (ret) goto out_socks; nn->lockd_up = true; } ret = nfsd_file_cache_start_net(net); if (ret) goto out_lockd; ret = nfsd_reply_cache_init(nn); if (ret) goto out_filecache; #ifdef CONFIG_NFSD_V4_2_INTER_SSC nfsd4_ssc_init_umount_work(nn); #endif ret = nfs4_state_start_net(net); if (ret) goto out_reply_cache; nn->nfsd_net_up = true; return 0; out_reply_cache: nfsd_reply_cache_shutdown(nn); out_filecache: nfsd_file_cache_shutdown_net(net); out_lockd: if (nn->lockd_up) { lockd_down(net); nn->lockd_up = false; } out_socks: nfsd_shutdown_generic(); return ret; } static void nfsd_shutdown_net(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (!nn->nfsd_net_up) return; percpu_ref_kill_and_confirm(&nn->nfsd_net_ref, nfsd_net_done); wait_for_completion(&nn->nfsd_net_confirm_done); nfsd_export_flush(net); nfs4_state_shutdown_net(net); nfsd_reply_cache_shutdown(nn); nfsd_file_cache_shutdown_net(net); if (nn->lockd_up) { lockd_down(net); nn->lockd_up = false; } wait_for_completion(&nn->nfsd_net_free_done); percpu_ref_exit(&nn->nfsd_net_ref); nn->nfsd_net_up = false; nfsd_shutdown_generic(); } static DEFINE_SPINLOCK(nfsd_notifier_lock); static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct net_device *dev = ifa->ifa_dev->dev; struct net *net = dev_net(dev); struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct sockaddr_in sin; if (event != NETDEV_DOWN || !nn->nfsd_serv) goto out; spin_lock(&nfsd_notifier_lock); if (nn->nfsd_serv) { dprintk("nfsd_inetaddr_event: removed %pI4\n", &ifa->ifa_local); sin.sin_family = AF_INET; sin.sin_addr.s_addr = ifa->ifa_local; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin); } spin_unlock(&nfsd_notifier_lock); out: return NOTIFY_DONE; } static struct notifier_block nfsd_inetaddr_notifier = { .notifier_call = nfsd_inetaddr_event, }; #if IS_ENABLED(CONFIG_IPV6) static int nfsd_inet6addr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; struct net_device *dev = ifa->idev->dev; struct net *net = dev_net(dev); struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct sockaddr_in6 sin6; if (event != NETDEV_DOWN || !nn->nfsd_serv) goto out; spin_lock(&nfsd_notifier_lock); if (nn->nfsd_serv) { dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr); sin6.sin6_family = AF_INET6; sin6.sin6_addr = ifa->addr; if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL) sin6.sin6_scope_id = ifa->idev->dev->ifindex; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6); } spin_unlock(&nfsd_notifier_lock); out: return NOTIFY_DONE; } static struct notifier_block nfsd_inet6addr_notifier = { .notifier_call = nfsd_inet6addr_event, }; #endif /* Only used under nfsd_mutex, so this atomic may be overkill: */ static atomic_t nfsd_notifier_refcount = ATOMIC_INIT(0); /** * nfsd_destroy_serv - tear down NFSD's svc_serv for a namespace * @net: network namespace the NFS service is associated with */ void nfsd_destroy_serv(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv = nn->nfsd_serv; lockdep_assert_held(&nfsd_mutex); spin_lock(&nfsd_notifier_lock); nn->nfsd_serv = NULL; spin_unlock(&nfsd_notifier_lock); /* check if the notifier still has clients */ if (atomic_dec_return(&nfsd_notifier_refcount) == 0) { unregister_inetaddr_notifier(&nfsd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif } /* * write_ports can create the server without actually starting * any threads. If we get shut down before any threads are * started, then nfsd_destroy_serv will be run before any of this * other initialization has been done except the rpcb information. */ svc_xprt_destroy_all(serv, net, true); nfsd_shutdown_net(net); svc_destroy(&serv); } void nfsd_reset_versions(struct nfsd_net *nn) { int i; for (i = 0; i <= NFSD_MAXVERS; i++) if (nfsd_vers(nn, i, NFSD_TEST)) return; for (i = 0; i <= NFSD_MAXVERS; i++) if (i != 4) nfsd_vers(nn, i, NFSD_SET); else { int minor = 0; while (nfsd_minorversion(nn, minor, NFSD_SET) >= 0) minor++; } } static int nfsd_get_default_max_blksize(void) { struct sysinfo i; unsigned long long target; unsigned long ret; si_meminfo(&i); target = (i.totalram - i.totalhigh) << PAGE_SHIFT; /* * Aim for 1/4096 of memory per thread This gives 1MB on 4Gig * machines, but only uses 32K on 128M machines. Bottom out at * 8K on 32M and smaller. Of course, this is only a default. */ target >>= 12; ret = NFSSVC_DEFBLKSIZE; while (ret > target && ret >= 8*1024*2) ret /= 2; return ret; } void nfsd_shutdown_threads(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv; mutex_lock(&nfsd_mutex); serv = nn->nfsd_serv; if (serv == NULL) { mutex_unlock(&nfsd_mutex); return; } /* Kill outstanding nfsd threads */ svc_set_num_threads(serv, NULL, 0); nfsd_destroy_serv(net); mutex_unlock(&nfsd_mutex); } struct svc_rqst *nfsd_current_rqst(void) { if (kthread_func(current) == nfsd) return kthread_data(current); return NULL; } int nfsd_create_serv(struct net *net) { int error; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv; WARN_ON(!mutex_is_locked(&nfsd_mutex)); if (nn->nfsd_serv) return 0; error = percpu_ref_init(&nn->nfsd_net_ref, nfsd_net_free, 0, GFP_KERNEL); if (error) return error; init_completion(&nn->nfsd_net_free_done); init_completion(&nn->nfsd_net_confirm_done); if (nfsd_max_blksize == 0) nfsd_max_blksize = nfsd_get_default_max_blksize(); nfsd_reset_versions(nn); serv = svc_create_pooled(nfsd_programs, ARRAY_SIZE(nfsd_programs), &nn->nfsd_svcstats, nfsd_max_blksize, nfsd); if (serv == NULL) return -ENOMEM; error = svc_bind(serv, net); if (error < 0) { svc_destroy(&serv); return error; } spin_lock(&nfsd_notifier_lock); nn->nfsd_serv = serv; spin_unlock(&nfsd_notifier_lock); /* check if the notifier is already set */ if (atomic_inc_return(&nfsd_notifier_refcount) == 1) { register_inetaddr_notifier(&nfsd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) register_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif } nfsd_reset_write_verifier(nn); return 0; } int nfsd_nrpools(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (nn->nfsd_serv == NULL) return 0; else return nn->nfsd_serv->sv_nrpools; } int nfsd_get_nrthreads(int n, int *nthreads, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv = nn->nfsd_serv; int i; if (serv) for (i = 0; i < serv->sv_nrpools && i < n; i++) nthreads[i] = serv->sv_pools[i].sp_nrthreads; return 0; } /** * nfsd_set_nrthreads - set the number of running threads in the net's service * @n: number of array members in @nthreads * @nthreads: array of thread counts for each pool * @net: network namespace to operate within * * This function alters the number of running threads for the given network * namespace in each pool. If passed an array longer then the number of pools * the extra pool settings are ignored. If passed an array shorter than the * number of pools, the missing values are interpreted as 0's. * * Returns 0 on success or a negative errno on error. */ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) { int i = 0; int tot = 0; int err = 0; struct nfsd_net *nn = net_generic(net, nfsd_net_id); lockdep_assert_held(&nfsd_mutex); if (nn->nfsd_serv == NULL || n <= 0) return 0; /* * Special case: When n == 1, pass in NULL for the pool, so that the * change is distributed equally among them. */ if (n == 1) return svc_set_num_threads(nn->nfsd_serv, NULL, nthreads[0]); if (n > nn->nfsd_serv->sv_nrpools) n = nn->nfsd_serv->sv_nrpools; /* enforce a global maximum number of threads */ tot = 0; for (i = 0; i < n; i++) { nthreads[i] = min(nthreads[i], NFSD_MAXSERVS); tot += nthreads[i]; } if (tot > NFSD_MAXSERVS) { /* total too large: scale down requested numbers */ for (i = 0; i < n && tot > 0; i++) { int new = nthreads[i] * NFSD_MAXSERVS / tot; tot -= (nthreads[i] - new); nthreads[i] = new; } for (i = 0; i < n && tot > 0; i++) { nthreads[i]--; tot--; } } /* apply the new numbers */ for (i = 0; i < n; i++) { err = svc_set_num_threads(nn->nfsd_serv, &nn->nfsd_serv->sv_pools[i], nthreads[i]); if (err) goto out; } /* Anything undefined in array is considered to be 0 */ for (i = n; i < nn->nfsd_serv->sv_nrpools; ++i) { err = svc_set_num_threads(nn->nfsd_serv, &nn->nfsd_serv->sv_pools[i], 0); if (err) goto out; } out: return err; } /** * nfsd_svc: start up or shut down the nfsd server * @n: number of array members in @nthreads * @nthreads: array of thread counts for each pool * @net: network namespace to operate within * @cred: credentials to use for xprt creation * @scope: server scope value (defaults to nodename) * * Adjust the number of threads in each pool and return the new * total number of threads in the service. */ int nfsd_svc(int n, int *nthreads, struct net *net, const struct cred *cred, const char *scope) { int error; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv; lockdep_assert_held(&nfsd_mutex); dprintk("nfsd: creating service\n"); strscpy(nn->nfsd_name, scope ? scope : utsname()->nodename, sizeof(nn->nfsd_name)); error = nfsd_create_serv(net); if (error) goto out; serv = nn->nfsd_serv; error = nfsd_startup_net(net, cred); if (error) goto out_put; error = nfsd_set_nrthreads(n, nthreads, net); if (error) goto out_put; error = serv->sv_nrthreads; out_put: if (serv->sv_nrthreads == 0) nfsd_destroy_serv(net); out: return error; } #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static bool nfsd_support_acl_version(int vers) { if (vers >= NFSD_ACL_MINVERS && vers < NFSD_ACL_NRVERS) return nfsd_acl_version[vers] != NULL; return false; } static int nfsd_acl_rpcbind_set(struct net *net, const struct svc_program *progp, u32 version, int family, unsigned short proto, unsigned short port) { if (!nfsd_support_acl_version(version) || !nfsd_vers(net_generic(net, nfsd_net_id), version, NFSD_TEST)) return 0; return svc_generic_rpcbind_set(net, progp, version, family, proto, port); } static __be32 nfsd_acl_init_request(struct svc_rqst *rqstp, const struct svc_program *progp, struct svc_process_info *ret) { struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); int i; if (likely(nfsd_support_acl_version(rqstp->rq_vers) && nfsd_vers(nn, rqstp->rq_vers, NFSD_TEST))) return svc_generic_init_request(rqstp, progp, ret); ret->mismatch.lovers = NFSD_ACL_NRVERS; for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++) { if (nfsd_support_acl_version(rqstp->rq_vers) && nfsd_vers(nn, i, NFSD_TEST)) { ret->mismatch.lovers = i; break; } } if (ret->mismatch.lovers == NFSD_ACL_NRVERS) return rpc_prog_unavail; ret->mismatch.hivers = NFSD_ACL_MINVERS; for (i = NFSD_ACL_NRVERS - 1; i >= NFSD_ACL_MINVERS; i--) { if (nfsd_support_acl_version(rqstp->rq_vers) && nfsd_vers(nn, i, NFSD_TEST)) { ret->mismatch.hivers = i; break; } } return rpc_prog_mismatch; } #endif static int nfsd_rpcbind_set(struct net *net, const struct svc_program *progp, u32 version, int family, unsigned short proto, unsigned short port) { if (!nfsd_vers(net_generic(net, nfsd_net_id), version, NFSD_TEST)) return 0; return svc_generic_rpcbind_set(net, progp, version, family, proto, port); } static __be32 nfsd_init_request(struct svc_rqst *rqstp, const struct svc_program *progp, struct svc_process_info *ret) { struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); int i; if (likely(nfsd_vers(nn, rqstp->rq_vers, NFSD_TEST))) return svc_generic_init_request(rqstp, progp, ret); ret->mismatch.lovers = NFSD_MAXVERS + 1; for (i = NFSD_MINVERS; i <= NFSD_MAXVERS; i++) { if (nfsd_vers(nn, i, NFSD_TEST)) { ret->mismatch.lovers = i; break; } } if (ret->mismatch.lovers > NFSD_MAXVERS) return rpc_prog_unavail; ret->mismatch.hivers = NFSD_MINVERS; for (i = NFSD_MAXVERS; i >= NFSD_MINVERS; i--) { if (nfsd_vers(nn, i, NFSD_TEST)) { ret->mismatch.hivers = i; break; } } return rpc_prog_mismatch; } /* * This is the NFS server kernel thread */ static int nfsd(void *vrqstp) { struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list); struct net *net = perm_sock->xpt_net; struct nfsd_net *nn = net_generic(net, nfsd_net_id); /* At this point, the thread shares current->fs * with the init process. We need to create files with the * umask as defined by the client instead of init's umask. */ svc_thread_init_status(rqstp, unshare_fs_struct()); current->fs->umask = 0; atomic_inc(&nfsd_th_cnt); set_freezable(); /* * The main request loop */ while (!svc_thread_should_stop(rqstp)) { svc_recv(rqstp); nfsd_file_net_dispose(nn); } atomic_dec(&nfsd_th_cnt); /* Release the thread */ svc_exit_thread(rqstp); return 0; } /** * nfsd_dispatch - Process an NFS or NFSACL or LOCALIO Request * @rqstp: incoming request * * This RPC dispatcher integrates the NFS server's duplicate reply cache. * * Return values: * %0: Processing complete; do not send a Reply * %1: Processing complete; send Reply in rqstp->rq_res */ int nfsd_dispatch(struct svc_rqst *rqstp) { const struct svc_procedure *proc = rqstp->rq_procinfo; __be32 *statp = rqstp->rq_accept_statp; struct nfsd_cacherep *rp; unsigned int start, len; __be32 *nfs_reply; /* * Give the xdr decoder a chance to change this if it wants * (necessary in the NFSv4.0 compound case) */ rqstp->rq_cachetype = proc->pc_cachetype; /* * ->pc_decode advances the argument stream past the NFS * Call header, so grab the header's starting location and * size now for the call to nfsd_cache_lookup(). */ start = xdr_stream_pos(&rqstp->rq_arg_stream); len = xdr_stream_remaining(&rqstp->rq_arg_stream); if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream)) goto out_decode_err; /* * Release rq_status_counter setting it to an odd value after the rpc * request has been properly parsed. rq_status_counter is used to * notify the consumers if the rqstp fields are stable * (rq_status_counter is odd) or not meaningful (rq_status_counter * is even). */ smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter | 1); rp = NULL; switch (nfsd_cache_lookup(rqstp, start, len, &rp)) { case RC_DOIT: break; case RC_REPLY: goto out_cached_reply; case RC_DROPIT: goto out_dropit; } nfs_reply = xdr_inline_decode(&rqstp->rq_res_stream, 0); *statp = proc->pc_func(rqstp); if (test_bit(RQ_DROPME, &rqstp->rq_flags)) goto out_update_drop; if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream)) goto out_encode_err; /* * Release rq_status_counter setting it to an even value after the rpc * request has been properly processed. */ smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1); nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, nfs_reply); out_cached_reply: return 1; out_decode_err: trace_nfsd_garbage_args_err(rqstp); *statp = rpc_garbage_args; return 1; out_update_drop: nfsd_cache_update(rqstp, rp, RC_NOCACHE, NULL); out_dropit: return 0; out_encode_err: trace_nfsd_cant_encode_err(rqstp); nfsd_cache_update(rqstp, rp, RC_NOCACHE, NULL); *statp = rpc_system_err; return 1; } /** * nfssvc_decode_voidarg - Decode void arguments * @rqstp: Server RPC transaction context * @xdr: XDR stream positioned at arguments to decode * * Return values: * %false: Arguments were not valid * %true: Decoding was successful */ bool nfssvc_decode_voidarg(struct svc_rqst *rqstp, struct xdr_stream *xdr) { return true; } /** * nfssvc_encode_voidres - Encode void results * @rqstp: Server RPC transaction context * @xdr: XDR stream into which to encode results * * Return values: * %false: Local error while encoding * %true: Encoding was successful */ bool nfssvc_encode_voidres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { return true; }
3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 // SPDX-License-Identifier: GPL-2.0 /* * NVMe over Fabrics RDMA host code. * Copyright (c) 2015-2016 HGST, a Western Digital Company. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> #include <rdma/mr_pool.h> #include <linux/err.h> #include <linux/string.h> #include <linux/atomic.h> #include <linux/blk-mq.h> #include <linux/blk-integrity.h> #include <linux/types.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/scatterlist.h> #include <linux/nvme.h> #include <linux/unaligned.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> #include <linux/nvme-rdma.h> #include "nvme.h" #include "fabrics.h" #define NVME_RDMA_CM_TIMEOUT_MS 3000 /* 3 second */ #define NVME_RDMA_MAX_SEGMENTS 256 #define NVME_RDMA_MAX_INLINE_SEGMENTS 4 #define NVME_RDMA_DATA_SGL_SIZE \ (sizeof(struct scatterlist) * NVME_INLINE_SG_CNT) #define NVME_RDMA_METADATA_SGL_SIZE \ (sizeof(struct scatterlist) * NVME_INLINE_METADATA_SG_CNT) struct nvme_rdma_device { struct ib_device *dev; struct ib_pd *pd; struct kref ref; struct list_head entry; unsigned int num_inline_segments; }; struct nvme_rdma_qe { struct ib_cqe cqe; void *data; u64 dma; }; struct nvme_rdma_sgl { int nents; struct sg_table sg_table; }; struct nvme_rdma_queue; struct nvme_rdma_request { struct nvme_request req; struct ib_mr *mr; struct nvme_rdma_qe sqe; union nvme_result result; __le16 status; refcount_t ref; struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS]; u32 num_sge; struct ib_reg_wr reg_wr; struct ib_cqe reg_cqe; struct nvme_rdma_queue *queue; struct nvme_rdma_sgl data_sgl; struct nvme_rdma_sgl *metadata_sgl; bool use_sig_mr; }; enum nvme_rdma_queue_flags { NVME_RDMA_Q_ALLOCATED = 0, NVME_RDMA_Q_LIVE = 1, NVME_RDMA_Q_TR_READY = 2, }; struct nvme_rdma_queue { struct nvme_rdma_qe *rsp_ring; int queue_size; size_t cmnd_capsule_len; struct nvme_rdma_ctrl *ctrl; struct nvme_rdma_device *device; struct ib_cq *ib_cq; struct ib_qp *qp; unsigned long flags; struct rdma_cm_id *cm_id; int cm_error; struct completion cm_done; bool pi_support; int cq_size; struct mutex queue_lock; }; struct nvme_rdma_ctrl { /* read only in the hot path */ struct nvme_rdma_queue *queues; /* other member variables */ struct blk_mq_tag_set tag_set; struct work_struct err_work; struct nvme_rdma_qe async_event_sqe; struct delayed_work reconnect_work; struct list_head list; struct blk_mq_tag_set admin_tag_set; struct nvme_rdma_device *device; u32 max_fr_pages; struct sockaddr_storage addr; struct sockaddr_storage src_addr; struct nvme_ctrl ctrl; bool use_inline_data; u32 io_queues[HCTX_MAX_TYPES]; }; static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl) { return container_of(ctrl, struct nvme_rdma_ctrl, ctrl); } static LIST_HEAD(device_list); static DEFINE_MUTEX(device_list_mutex); static LIST_HEAD(nvme_rdma_ctrl_list); static DEFINE_MUTEX(nvme_rdma_ctrl_mutex); /* * Disabling this option makes small I/O goes faster, but is fundamentally * unsafe. With it turned off we will have to register a global rkey that * allows read and write access to all physical memory. */ static bool register_always = true; module_param(register_always, bool, 0444); MODULE_PARM_DESC(register_always, "Use memory registration even for contiguous memory regions"); static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event); static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc); static void nvme_rdma_complete_rq(struct request *rq); static const struct blk_mq_ops nvme_rdma_mq_ops; static const struct blk_mq_ops nvme_rdma_admin_mq_ops; static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue) { return queue - queue->ctrl->queues; } static bool nvme_rdma_poll_queue(struct nvme_rdma_queue *queue) { return nvme_rdma_queue_idx(queue) > queue->ctrl->io_queues[HCTX_TYPE_DEFAULT] + queue->ctrl->io_queues[HCTX_TYPE_READ]; } static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue) { return queue->cmnd_capsule_len - sizeof(struct nvme_command); } static void nvme_rdma_free_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe, size_t capsule_size, enum dma_data_direction dir) { ib_dma_unmap_single(ibdev, qe->dma, capsule_size, dir); kfree(qe->data); } static int nvme_rdma_alloc_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe, size_t capsule_size, enum dma_data_direction dir) { qe->data = kzalloc(capsule_size, GFP_KERNEL); if (!qe->data) return -ENOMEM; qe->dma = ib_dma_map_single(ibdev, qe->data, capsule_size, dir); if (ib_dma_mapping_error(ibdev, qe->dma)) { kfree(qe->data); qe->data = NULL; return -ENOMEM; } return 0; } static void nvme_rdma_free_ring(struct ib_device *ibdev, struct nvme_rdma_qe *ring, size_t ib_queue_size, size_t capsule_size, enum dma_data_direction dir) { int i; for (i = 0; i < ib_queue_size; i++) nvme_rdma_free_qe(ibdev, &ring[i], capsule_size, dir); kfree(ring); } static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev, size_t ib_queue_size, size_t capsule_size, enum dma_data_direction dir) { struct nvme_rdma_qe *ring; int i; ring = kcalloc(ib_queue_size, sizeof(struct nvme_rdma_qe), GFP_KERNEL); if (!ring) return NULL; /* * Bind the CQEs (post recv buffers) DMA mapping to the RDMA queue * lifetime. It's safe, since any change in the underlying RDMA device * will issue error recovery and queue re-creation. */ for (i = 0; i < ib_queue_size; i++) { if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir)) goto out_free_ring; } return ring; out_free_ring: nvme_rdma_free_ring(ibdev, ring, i, capsule_size, dir); return NULL; } static void nvme_rdma_qp_event(struct ib_event *event, void *context) { pr_debug("QP event %s (%d)\n", ib_event_msg(event->event), event->event); } static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue) { int ret; ret = wait_for_completion_interruptible(&queue->cm_done); if (ret) return ret; WARN_ON_ONCE(queue->cm_error > 0); return queue->cm_error; } static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor) { struct nvme_rdma_device *dev = queue->device; struct ib_qp_init_attr init_attr; int ret; memset(&init_attr, 0, sizeof(init_attr)); init_attr.event_handler = nvme_rdma_qp_event; /* +1 for drain */ init_attr.cap.max_send_wr = factor * queue->queue_size + 1; /* +1 for drain */ init_attr.cap.max_recv_wr = queue->queue_size + 1; init_attr.cap.max_recv_sge = 1; init_attr.cap.max_send_sge = 1 + dev->num_inline_segments; init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; init_attr.qp_type = IB_QPT_RC; init_attr.send_cq = queue->ib_cq; init_attr.recv_cq = queue->ib_cq; if (queue->pi_support) init_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN; init_attr.qp_context = queue; ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr); queue->qp = queue->cm_id->qp; return ret; } static void nvme_rdma_exit_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); kfree(req->sqe.data); } static int nvme_rdma_init_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx, unsigned int numa_node) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(set->driver_data); struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; nvme_req(rq)->ctrl = &ctrl->ctrl; req->sqe.data = kzalloc(sizeof(struct nvme_command), GFP_KERNEL); if (!req->sqe.data) return -ENOMEM; /* metadata nvme_rdma_sgl struct is located after command's data SGL */ if (queue->pi_support) req->metadata_sgl = (void *)nvme_req(rq) + sizeof(struct nvme_rdma_request) + NVME_RDMA_DATA_SGL_SIZE; req->queue = queue; nvme_req(rq)->cmd = req->sqe.data; return 0; } static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(data); struct nvme_rdma_queue *queue = &ctrl->queues[hctx_idx + 1]; BUG_ON(hctx_idx >= ctrl->ctrl.queue_count); hctx->driver_data = queue; return 0; } static int nvme_rdma_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(data); struct nvme_rdma_queue *queue = &ctrl->queues[0]; BUG_ON(hctx_idx != 0); hctx->driver_data = queue; return 0; } static void nvme_rdma_free_dev(struct kref *ref) { struct nvme_rdma_device *ndev = container_of(ref, struct nvme_rdma_device, ref); mutex_lock(&device_list_mutex); list_del(&ndev->entry); mutex_unlock(&device_list_mutex); ib_dealloc_pd(ndev->pd); kfree(ndev); } static void nvme_rdma_dev_put(struct nvme_rdma_device *dev) { kref_put(&dev->ref, nvme_rdma_free_dev); } static int nvme_rdma_dev_get(struct nvme_rdma_device *dev) { return kref_get_unless_zero(&dev->ref); } static struct nvme_rdma_device * nvme_rdma_find_get_device(struct rdma_cm_id *cm_id) { struct nvme_rdma_device *ndev; mutex_lock(&device_list_mutex); list_for_each_entry(ndev, &device_list, entry) { if (ndev->dev->node_guid == cm_id->device->node_guid && nvme_rdma_dev_get(ndev)) goto out_unlock; } ndev = kzalloc(sizeof(*ndev), GFP_KERNEL); if (!ndev) goto out_err; ndev->dev = cm_id->device; kref_init(&ndev->ref); ndev->pd = ib_alloc_pd(ndev->dev, register_always ? 0 : IB_PD_UNSAFE_GLOBAL_RKEY); if (IS_ERR(ndev->pd)) goto out_free_dev; if (!(ndev->dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) { dev_err(&ndev->dev->dev, "Memory registrations not supported.\n"); goto out_free_pd; } ndev->num_inline_segments = min(NVME_RDMA_MAX_INLINE_SEGMENTS, ndev->dev->attrs.max_send_sge - 1); list_add(&ndev->entry, &device_list); out_unlock: mutex_unlock(&device_list_mutex); return ndev; out_free_pd: ib_dealloc_pd(ndev->pd); out_free_dev: kfree(ndev); out_err: mutex_unlock(&device_list_mutex); return NULL; } static void nvme_rdma_free_cq(struct nvme_rdma_queue *queue) { if (nvme_rdma_poll_queue(queue)) ib_free_cq(queue->ib_cq); else ib_cq_pool_put(queue->ib_cq, queue->cq_size); } static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) { struct nvme_rdma_device *dev; struct ib_device *ibdev; if (!test_and_clear_bit(NVME_RDMA_Q_TR_READY, &queue->flags)) return; dev = queue->device; ibdev = dev->dev; if (queue->pi_support) ib_mr_pool_destroy(queue->qp, &queue->qp->sig_mrs); ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs); /* * The cm_id object might have been destroyed during RDMA connection * establishment error flow to avoid getting other cma events, thus * the destruction of the QP shouldn't use rdma_cm API. */ ib_destroy_qp(queue->qp); nvme_rdma_free_cq(queue); nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size, sizeof(struct nvme_completion), DMA_FROM_DEVICE); nvme_rdma_dev_put(dev); } static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev, bool pi_support) { u32 max_page_list_len; if (pi_support) max_page_list_len = ibdev->attrs.max_pi_fast_reg_page_list_len; else max_page_list_len = ibdev->attrs.max_fast_reg_page_list_len; return min_t(u32, NVME_RDMA_MAX_SEGMENTS, max_page_list_len - 1); } static int nvme_rdma_create_cq(struct ib_device *ibdev, struct nvme_rdma_queue *queue) { int ret, comp_vector, idx = nvme_rdma_queue_idx(queue); /* * Spread I/O queues completion vectors according their queue index. * Admin queues can always go on completion vector 0. */ comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors; /* Polling queues need direct cq polling context */ if (nvme_rdma_poll_queue(queue)) queue->ib_cq = ib_alloc_cq(ibdev, queue, queue->cq_size, comp_vector, IB_POLL_DIRECT); else queue->ib_cq = ib_cq_pool_get(ibdev, queue->cq_size, comp_vector, IB_POLL_SOFTIRQ); if (IS_ERR(queue->ib_cq)) { ret = PTR_ERR(queue->ib_cq); return ret; } return 0; } static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) { struct ib_device *ibdev; const int send_wr_factor = 3; /* MR, SEND, INV */ const int cq_factor = send_wr_factor + 1; /* + RECV */ int ret, pages_per_mr; queue->device = nvme_rdma_find_get_device(queue->cm_id); if (!queue->device) { dev_err(queue->cm_id->device->dev.parent, "no client data found!\n"); return -ECONNREFUSED; } ibdev = queue->device->dev; /* +1 for ib_drain_qp */ queue->cq_size = cq_factor * queue->queue_size + 1; ret = nvme_rdma_create_cq(ibdev, queue); if (ret) goto out_put_dev; ret = nvme_rdma_create_qp(queue, send_wr_factor); if (ret) goto out_destroy_ib_cq; queue->rsp_ring = nvme_rdma_alloc_ring(ibdev, queue->queue_size, sizeof(struct nvme_completion), DMA_FROM_DEVICE); if (!queue->rsp_ring) { ret = -ENOMEM; goto out_destroy_qp; } /* * Currently we don't use SG_GAPS MR's so if the first entry is * misaligned we'll end up using two entries for a single data page, * so one additional entry is required. */ pages_per_mr = nvme_rdma_get_max_fr_pages(ibdev, queue->pi_support) + 1; ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs, queue->queue_size, IB_MR_TYPE_MEM_REG, pages_per_mr, 0); if (ret) { dev_err(queue->ctrl->ctrl.device, "failed to initialize MR pool sized %d for QID %d\n", queue->queue_size, nvme_rdma_queue_idx(queue)); goto out_destroy_ring; } if (queue->pi_support) { ret = ib_mr_pool_init(queue->qp, &queue->qp->sig_mrs, queue->queue_size, IB_MR_TYPE_INTEGRITY, pages_per_mr, pages_per_mr); if (ret) { dev_err(queue->ctrl->ctrl.device, "failed to initialize PI MR pool sized %d for QID %d\n", queue->queue_size, nvme_rdma_queue_idx(queue)); goto out_destroy_mr_pool; } } set_bit(NVME_RDMA_Q_TR_READY, &queue->flags); return 0; out_destroy_mr_pool: ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs); out_destroy_ring: nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size, sizeof(struct nvme_completion), DMA_FROM_DEVICE); out_destroy_qp: rdma_destroy_qp(queue->cm_id); out_destroy_ib_cq: nvme_rdma_free_cq(queue); out_put_dev: nvme_rdma_dev_put(queue->device); return ret; } static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl, int idx, size_t queue_size) { struct nvme_rdma_queue *queue; struct sockaddr *src_addr = NULL; int ret; queue = &ctrl->queues[idx]; mutex_init(&queue->queue_lock); queue->ctrl = ctrl; if (idx && ctrl->ctrl.max_integrity_segments) queue->pi_support = true; else queue->pi_support = false; init_completion(&queue->cm_done); if (idx > 0) queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16; else queue->cmnd_capsule_len = sizeof(struct nvme_command); queue->queue_size = queue_size; queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(queue->cm_id)) { dev_info(ctrl->ctrl.device, "failed to create CM ID: %ld\n", PTR_ERR(queue->cm_id)); ret = PTR_ERR(queue->cm_id); goto out_destroy_mutex; } if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) src_addr = (struct sockaddr *)&ctrl->src_addr; queue->cm_error = -ETIMEDOUT; ret = rdma_resolve_addr(queue->cm_id, src_addr, (struct sockaddr *)&ctrl->addr, NVME_RDMA_CM_TIMEOUT_MS); if (ret) { dev_info(ctrl->ctrl.device, "rdma_resolve_addr failed (%d).\n", ret); goto out_destroy_cm_id; } ret = nvme_rdma_wait_for_cm(queue); if (ret) { dev_info(ctrl->ctrl.device, "rdma connection establishment failed (%d)\n", ret); goto out_destroy_cm_id; } set_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags); return 0; out_destroy_cm_id: rdma_destroy_id(queue->cm_id); nvme_rdma_destroy_queue_ib(queue); out_destroy_mutex: mutex_destroy(&queue->queue_lock); return ret; } static void __nvme_rdma_stop_queue(struct nvme_rdma_queue *queue) { rdma_disconnect(queue->cm_id); ib_drain_qp(queue->qp); } static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue) { if (!test_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags)) return; mutex_lock(&queue->queue_lock); if (test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags)) __nvme_rdma_stop_queue(queue); mutex_unlock(&queue->queue_lock); } static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue) { if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags)) return; rdma_destroy_id(queue->cm_id); nvme_rdma_destroy_queue_ib(queue); mutex_destroy(&queue->queue_lock); } static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl) { int i; for (i = 1; i < ctrl->ctrl.queue_count; i++) nvme_rdma_free_queue(&ctrl->queues[i]); } static void nvme_rdma_stop_io_queues(struct nvme_rdma_ctrl *ctrl) { int i; for (i = 1; i < ctrl->ctrl.queue_count; i++) nvme_rdma_stop_queue(&ctrl->queues[i]); } static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx) { struct nvme_rdma_queue *queue = &ctrl->queues[idx]; int ret; if (idx) ret = nvmf_connect_io_queue(&ctrl->ctrl, idx); else ret = nvmf_connect_admin_queue(&ctrl->ctrl); if (!ret) { set_bit(NVME_RDMA_Q_LIVE, &queue->flags); } else { if (test_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags)) __nvme_rdma_stop_queue(queue); dev_info(ctrl->ctrl.device, "failed to connect queue: %d ret=%d\n", idx, ret); } return ret; } static int nvme_rdma_start_io_queues(struct nvme_rdma_ctrl *ctrl, int first, int last) { int i, ret = 0; for (i = first; i < last; i++) { ret = nvme_rdma_start_queue(ctrl, i); if (ret) goto out_stop_queues; } return 0; out_stop_queues: for (i--; i >= first; i--) nvme_rdma_stop_queue(&ctrl->queues[i]); return ret; } static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl) { struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; unsigned int nr_io_queues; int i, ret; nr_io_queues = nvmf_nr_io_queues(opts); ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); if (ret) return ret; if (nr_io_queues == 0) { dev_err(ctrl->ctrl.device, "unable to set any I/O queues\n"); return -ENOMEM; } ctrl->ctrl.queue_count = nr_io_queues + 1; dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n", nr_io_queues); nvmf_set_io_queues(opts, nr_io_queues, ctrl->io_queues); for (i = 1; i < ctrl->ctrl.queue_count; i++) { ret = nvme_rdma_alloc_queue(ctrl, i, ctrl->ctrl.sqsize + 1); if (ret) goto out_free_queues; } return 0; out_free_queues: for (i--; i >= 1; i--) nvme_rdma_free_queue(&ctrl->queues[i]); return ret; } static int nvme_rdma_alloc_tag_set(struct nvme_ctrl *ctrl) { unsigned int cmd_size = sizeof(struct nvme_rdma_request) + NVME_RDMA_DATA_SGL_SIZE; if (ctrl->max_integrity_segments) cmd_size += sizeof(struct nvme_rdma_sgl) + NVME_RDMA_METADATA_SGL_SIZE; return nvme_alloc_io_tag_set(ctrl, &to_rdma_ctrl(ctrl)->tag_set, &nvme_rdma_mq_ops, ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2, cmd_size); } static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl) { if (ctrl->async_event_sqe.data) { cancel_work_sync(&ctrl->ctrl.async_event_work); nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe, sizeof(struct nvme_command), DMA_TO_DEVICE); ctrl->async_event_sqe.data = NULL; } nvme_rdma_free_queue(&ctrl->queues[0]); } static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, bool new) { bool pi_capable = false; int error; error = nvme_rdma_alloc_queue(ctrl, 0, NVME_AQ_DEPTH); if (error) return error; ctrl->device = ctrl->queues[0].device; ctrl->ctrl.numa_node = ibdev_to_node(ctrl->device->dev); /* T10-PI support */ if (ctrl->device->dev->attrs.kernel_cap_flags & IBK_INTEGRITY_HANDOVER) pi_capable = true; ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev, pi_capable); /* * Bind the async event SQE DMA mapping to the admin queue lifetime. * It's safe, since any change in the underlying RDMA device will issue * error recovery and queue re-creation. */ error = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe, sizeof(struct nvme_command), DMA_TO_DEVICE); if (error) goto out_free_queue; if (new) { error = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set, &nvme_rdma_admin_mq_ops, sizeof(struct nvme_rdma_request) + NVME_RDMA_DATA_SGL_SIZE); if (error) goto out_free_async_qe; } error = nvme_rdma_start_queue(ctrl, 0); if (error) goto out_remove_admin_tag_set; error = nvme_enable_ctrl(&ctrl->ctrl); if (error) goto out_stop_queue; ctrl->ctrl.max_segments = ctrl->max_fr_pages; ctrl->ctrl.max_hw_sectors = ctrl->max_fr_pages << (ilog2(SZ_4K) - 9); if (pi_capable) ctrl->ctrl.max_integrity_segments = ctrl->max_fr_pages; else ctrl->ctrl.max_integrity_segments = 0; nvme_unquiesce_admin_queue(&ctrl->ctrl); error = nvme_init_ctrl_finish(&ctrl->ctrl, false); if (error) goto out_quiesce_queue; return 0; out_quiesce_queue: nvme_quiesce_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); out_stop_queue: nvme_rdma_stop_queue(&ctrl->queues[0]); nvme_cancel_admin_tagset(&ctrl->ctrl); out_remove_admin_tag_set: if (new) nvme_remove_admin_tag_set(&ctrl->ctrl); out_free_async_qe: if (ctrl->async_event_sqe.data) { nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe, sizeof(struct nvme_command), DMA_TO_DEVICE); ctrl->async_event_sqe.data = NULL; } out_free_queue: nvme_rdma_free_queue(&ctrl->queues[0]); return error; } static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) { int ret, nr_queues; ret = nvme_rdma_alloc_io_queues(ctrl); if (ret) return ret; if (new) { ret = nvme_rdma_alloc_tag_set(&ctrl->ctrl); if (ret) goto out_free_io_queues; } /* * Only start IO queues for which we have allocated the tagset * and limited it to the available queues. On reconnects, the * queue number might have changed. */ nr_queues = min(ctrl->tag_set.nr_hw_queues + 1, ctrl->ctrl.queue_count); ret = nvme_rdma_start_io_queues(ctrl, 1, nr_queues); if (ret) goto out_cleanup_tagset; if (!new) { nvme_start_freeze(&ctrl->ctrl); nvme_unquiesce_io_queues(&ctrl->ctrl); if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) { /* * If we timed out waiting for freeze we are likely to * be stuck. Fail the controller initialization just * to be safe. */ ret = -ENODEV; nvme_unfreeze(&ctrl->ctrl); goto out_wait_freeze_timed_out; } blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset, ctrl->ctrl.queue_count - 1); nvme_unfreeze(&ctrl->ctrl); } /* * If the number of queues has increased (reconnect case) * start all new queues now. */ ret = nvme_rdma_start_io_queues(ctrl, nr_queues, ctrl->tag_set.nr_hw_queues + 1); if (ret) goto out_wait_freeze_timed_out; return 0; out_wait_freeze_timed_out: nvme_quiesce_io_queues(&ctrl->ctrl); nvme_sync_io_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); out_cleanup_tagset: nvme_cancel_tagset(&ctrl->ctrl); if (new) nvme_remove_io_tag_set(&ctrl->ctrl); out_free_io_queues: nvme_rdma_free_io_queues(ctrl); return ret; } static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, bool remove) { nvme_quiesce_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); nvme_cancel_admin_tagset(&ctrl->ctrl); if (remove) { nvme_unquiesce_admin_queue(&ctrl->ctrl); nvme_remove_admin_tag_set(&ctrl->ctrl); } nvme_rdma_destroy_admin_queue(ctrl); } static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, bool remove) { if (ctrl->ctrl.queue_count > 1) { nvme_quiesce_io_queues(&ctrl->ctrl); nvme_sync_io_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); nvme_cancel_tagset(&ctrl->ctrl); if (remove) { nvme_unquiesce_io_queues(&ctrl->ctrl); nvme_remove_io_tag_set(&ctrl->ctrl); } nvme_rdma_free_io_queues(ctrl); } } static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); flush_work(&ctrl->err_work); cancel_delayed_work_sync(&ctrl->reconnect_work); } static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); if (list_empty(&ctrl->list)) goto free_ctrl; mutex_lock(&nvme_rdma_ctrl_mutex); list_del(&ctrl->list); mutex_unlock(&nvme_rdma_ctrl_mutex); nvmf_free_options(nctrl->opts); free_ctrl: kfree(ctrl->queues); kfree(ctrl); } static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl, int status) { enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl); /* If we are resetting/deleting then do nothing */ if (state != NVME_CTRL_CONNECTING) { WARN_ON_ONCE(state == NVME_CTRL_NEW || state == NVME_CTRL_LIVE); return; } if (nvmf_should_reconnect(&ctrl->ctrl, status)) { dev_info(ctrl->ctrl.device, "Reconnecting in %d seconds...\n", ctrl->ctrl.opts->reconnect_delay); queue_delayed_work(nvme_wq, &ctrl->reconnect_work, ctrl->ctrl.opts->reconnect_delay * HZ); } else { nvme_delete_ctrl(&ctrl->ctrl); } } static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) { int ret; bool changed; u16 max_queue_size; ret = nvme_rdma_configure_admin_queue(ctrl, new); if (ret) return ret; if (ctrl->ctrl.icdoff) { ret = -EOPNOTSUPP; dev_err(ctrl->ctrl.device, "icdoff is not supported!\n"); goto destroy_admin; } if (!(ctrl->ctrl.sgls & NVME_CTRL_SGLS_KSDBDS)) { ret = -EOPNOTSUPP; dev_err(ctrl->ctrl.device, "Mandatory keyed sgls are not supported!\n"); goto destroy_admin; } if (ctrl->ctrl.opts->queue_size > ctrl->ctrl.sqsize + 1) { dev_warn(ctrl->ctrl.device, "queue_size %zu > ctrl sqsize %u, clamping down\n", ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1); } if (ctrl->ctrl.max_integrity_segments) max_queue_size = NVME_RDMA_MAX_METADATA_QUEUE_SIZE; else max_queue_size = NVME_RDMA_MAX_QUEUE_SIZE; if (ctrl->ctrl.sqsize + 1 > max_queue_size) { dev_warn(ctrl->ctrl.device, "ctrl sqsize %u > max queue size %u, clamping down\n", ctrl->ctrl.sqsize + 1, max_queue_size); ctrl->ctrl.sqsize = max_queue_size - 1; } if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) { dev_warn(ctrl->ctrl.device, "sqsize %u > ctrl maxcmd %u, clamping down\n", ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd); ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1; } if (ctrl->ctrl.sgls & NVME_CTRL_SGLS_SAOS) ctrl->use_inline_data = true; if (ctrl->ctrl.queue_count > 1) { ret = nvme_rdma_configure_io_queues(ctrl, new); if (ret) goto destroy_admin; } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); if (!changed) { /* * state change failure is ok if we started ctrl delete, * unless we're during creation of a new controller to * avoid races with teardown flow. */ enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl); WARN_ON_ONCE(state != NVME_CTRL_DELETING && state != NVME_CTRL_DELETING_NOIO); WARN_ON_ONCE(new); ret = -EINVAL; goto destroy_io; } nvme_start_ctrl(&ctrl->ctrl); return 0; destroy_io: if (ctrl->ctrl.queue_count > 1) { nvme_quiesce_io_queues(&ctrl->ctrl); nvme_sync_io_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); nvme_cancel_tagset(&ctrl->ctrl); if (new) nvme_remove_io_tag_set(&ctrl->ctrl); nvme_rdma_free_io_queues(ctrl); } destroy_admin: nvme_stop_keep_alive(&ctrl->ctrl); nvme_rdma_teardown_admin_queue(ctrl, new); return ret; } static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) { struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work), struct nvme_rdma_ctrl, reconnect_work); int ret; ++ctrl->ctrl.nr_reconnects; ret = nvme_rdma_setup_ctrl(ctrl, false); if (ret) goto requeue; dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n", ctrl->ctrl.nr_reconnects); ctrl->ctrl.nr_reconnects = 0; return; requeue: dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d/%d\n", ctrl->ctrl.nr_reconnects, ctrl->ctrl.opts->max_reconnects); nvme_rdma_reconnect_or_remove(ctrl, ret); } static void nvme_rdma_error_recovery_work(struct work_struct *work) { struct nvme_rdma_ctrl *ctrl = container_of(work, struct nvme_rdma_ctrl, err_work); nvme_stop_keep_alive(&ctrl->ctrl); flush_work(&ctrl->ctrl.async_event_work); nvme_rdma_teardown_io_queues(ctrl, false); nvme_unquiesce_io_queues(&ctrl->ctrl); nvme_rdma_teardown_admin_queue(ctrl, false); nvme_unquiesce_admin_queue(&ctrl->ctrl); nvme_auth_stop(&ctrl->ctrl); if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { /* state change failure is ok if we started ctrl delete */ enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl); WARN_ON_ONCE(state != NVME_CTRL_DELETING && state != NVME_CTRL_DELETING_NOIO); return; } nvme_rdma_reconnect_or_remove(ctrl, 0); } static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl) { if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) return; dev_warn(ctrl->ctrl.device, "starting error recovery\n"); queue_work(nvme_reset_wq, &ctrl->err_work); } static void nvme_rdma_end_request(struct nvme_rdma_request *req) { struct request *rq = blk_mq_rq_from_pdu(req); if (!refcount_dec_and_test(&req->ref)) return; if (!nvme_try_complete_req(rq, req->status, req->result)) nvme_rdma_complete_rq(rq); } static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc, const char *op) { struct nvme_rdma_queue *queue = wc->qp->qp_context; struct nvme_rdma_ctrl *ctrl = queue->ctrl; if (nvme_ctrl_state(&ctrl->ctrl) == NVME_CTRL_LIVE) dev_info(ctrl->ctrl.device, "%s for CQE 0x%p failed with status %s (%d)\n", op, wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status); nvme_rdma_error_recovery(ctrl); } static void nvme_rdma_memreg_done(struct ib_cq *cq, struct ib_wc *wc) { if (unlikely(wc->status != IB_WC_SUCCESS)) nvme_rdma_wr_error(cq, wc, "MEMREG"); } static void nvme_rdma_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvme_rdma_request *req = container_of(wc->wr_cqe, struct nvme_rdma_request, reg_cqe); if (unlikely(wc->status != IB_WC_SUCCESS)) nvme_rdma_wr_error(cq, wc, "LOCAL_INV"); else nvme_rdma_end_request(req); } static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue, struct nvme_rdma_request *req) { struct ib_send_wr wr = { .opcode = IB_WR_LOCAL_INV, .next = NULL, .num_sge = 0, .send_flags = IB_SEND_SIGNALED, .ex.invalidate_rkey = req->mr->rkey, }; req->reg_cqe.done = nvme_rdma_inv_rkey_done; wr.wr_cqe = &req->reg_cqe; return ib_post_send(queue->qp, &wr, NULL); } static void nvme_rdma_dma_unmap_req(struct ib_device *ibdev, struct request *rq) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); if (blk_integrity_rq(rq)) { ib_dma_unmap_sg(ibdev, req->metadata_sgl->sg_table.sgl, req->metadata_sgl->nents, rq_dma_dir(rq)); sg_free_table_chained(&req->metadata_sgl->sg_table, NVME_INLINE_METADATA_SG_CNT); } ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents, rq_dma_dir(rq)); sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT); } static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, struct request *rq) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); struct nvme_rdma_device *dev = queue->device; struct ib_device *ibdev = dev->dev; struct list_head *pool = &queue->qp->rdma_mrs; if (!blk_rq_nr_phys_segments(rq)) return; if (req->use_sig_mr) pool = &queue->qp->sig_mrs; if (req->mr) { ib_mr_pool_put(queue->qp, pool, req->mr); req->mr = NULL; } nvme_rdma_dma_unmap_req(ibdev, rq); } static int nvme_rdma_set_sg_null(struct nvme_command *c) { struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; sg->addr = 0; put_unaligned_le24(0, sg->length); put_unaligned_le32(0, sg->key); sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4; return 0; } static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue, struct nvme_rdma_request *req, struct nvme_command *c, int count) { struct nvme_sgl_desc *sg = &c->common.dptr.sgl; struct ib_sge *sge = &req->sge[1]; struct scatterlist *sgl; u32 len = 0; int i; for_each_sg(req->data_sgl.sg_table.sgl, sgl, count, i) { sge->addr = sg_dma_address(sgl); sge->length = sg_dma_len(sgl); sge->lkey = queue->device->pd->local_dma_lkey; len += sge->length; sge++; } sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff); sg->length = cpu_to_le32(len); sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET; req->num_sge += count; return 0; } static int nvme_rdma_map_sg_single(struct nvme_rdma_queue *queue, struct nvme_rdma_request *req, struct nvme_command *c) { struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; sg->addr = cpu_to_le64(sg_dma_address(req->data_sgl.sg_table.sgl)); put_unaligned_le24(sg_dma_len(req->data_sgl.sg_table.sgl), sg->length); put_unaligned_le32(queue->device->pd->unsafe_global_rkey, sg->key); sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4; return 0; } static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue, struct nvme_rdma_request *req, struct nvme_command *c, int count) { struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; int nr; req->mr = ib_mr_pool_get(queue->qp, &queue->qp->rdma_mrs); if (WARN_ON_ONCE(!req->mr)) return -EAGAIN; /* * Align the MR to a 4K page size to match the ctrl page size and * the block virtual boundary. */ nr = ib_map_mr_sg(req->mr, req->data_sgl.sg_table.sgl, count, NULL, SZ_4K); if (unlikely(nr < count)) { ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr); req->mr = NULL; if (nr < 0) return nr; return -EINVAL; } ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey)); req->reg_cqe.done = nvme_rdma_memreg_done; memset(&req->reg_wr, 0, sizeof(req->reg_wr)); req->reg_wr.wr.opcode = IB_WR_REG_MR; req->reg_wr.wr.wr_cqe = &req->reg_cqe; req->reg_wr.wr.num_sge = 0; req->reg_wr.mr = req->mr; req->reg_wr.key = req->mr->rkey; req->reg_wr.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE; sg->addr = cpu_to_le64(req->mr->iova); put_unaligned_le24(req->mr->length, sg->length); put_unaligned_le32(req->mr->rkey, sg->key); sg->type = (NVME_KEY_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_INVALIDATE; return 0; } static void nvme_rdma_set_sig_domain(struct blk_integrity *bi, struct nvme_command *cmd, struct ib_sig_domain *domain, u16 control, u8 pi_type) { domain->sig_type = IB_SIG_TYPE_T10_DIF; domain->sig.dif.bg_type = IB_T10DIF_CRC; domain->sig.dif.pi_interval = 1 << bi->interval_exp; domain->sig.dif.ref_tag = le32_to_cpu(cmd->rw.reftag); if (control & NVME_RW_PRINFO_PRCHK_REF) domain->sig.dif.ref_remap = true; domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.lbat); domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.lbatm); domain->sig.dif.app_escape = true; if (pi_type == NVME_NS_DPS_PI_TYPE3) domain->sig.dif.ref_escape = true; } static void nvme_rdma_set_sig_attrs(struct blk_integrity *bi, struct nvme_command *cmd, struct ib_sig_attrs *sig_attrs, u8 pi_type) { u16 control = le16_to_cpu(cmd->rw.control); memset(sig_attrs, 0, sizeof(*sig_attrs)); if (control & NVME_RW_PRINFO_PRACT) { /* for WRITE_INSERT/READ_STRIP no memory domain */ sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE; nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control, pi_type); /* Clear the PRACT bit since HCA will generate/verify the PI */ control &= ~NVME_RW_PRINFO_PRACT; cmd->rw.control = cpu_to_le16(control); } else { /* for WRITE_PASS/READ_PASS both wire/memory domains exist */ nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control, pi_type); nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem, control, pi_type); } } static void nvme_rdma_set_prot_checks(struct nvme_command *cmd, u8 *mask) { *mask = 0; if (le16_to_cpu(cmd->rw.control) & NVME_RW_PRINFO_PRCHK_REF) *mask |= IB_SIG_CHECK_REFTAG; if (le16_to_cpu(cmd->rw.control) & NVME_RW_PRINFO_PRCHK_GUARD) *mask |= IB_SIG_CHECK_GUARD; } static void nvme_rdma_sig_done(struct ib_cq *cq, struct ib_wc *wc) { if (unlikely(wc->status != IB_WC_SUCCESS)) nvme_rdma_wr_error(cq, wc, "SIG"); } static int nvme_rdma_map_sg_pi(struct nvme_rdma_queue *queue, struct nvme_rdma_request *req, struct nvme_command *c, int count, int pi_count) { struct nvme_rdma_sgl *sgl = &req->data_sgl; struct ib_reg_wr *wr = &req->reg_wr; struct request *rq = blk_mq_rq_from_pdu(req); struct nvme_ns *ns = rq->q->queuedata; struct bio *bio = rq->bio; struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); u32 xfer_len; int nr; req->mr = ib_mr_pool_get(queue->qp, &queue->qp->sig_mrs); if (WARN_ON_ONCE(!req->mr)) return -EAGAIN; nr = ib_map_mr_sg_pi(req->mr, sgl->sg_table.sgl, count, NULL, req->metadata_sgl->sg_table.sgl, pi_count, NULL, SZ_4K); if (unlikely(nr)) goto mr_put; nvme_rdma_set_sig_attrs(bi, c, req->mr->sig_attrs, ns->head->pi_type); nvme_rdma_set_prot_checks(c, &req->mr->sig_attrs->check_mask); ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey)); req->reg_cqe.done = nvme_rdma_sig_done; memset(wr, 0, sizeof(*wr)); wr->wr.opcode = IB_WR_REG_MR_INTEGRITY; wr->wr.wr_cqe = &req->reg_cqe; wr->wr.num_sge = 0; wr->wr.send_flags = 0; wr->mr = req->mr; wr->key = req->mr->rkey; wr->access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE; sg->addr = cpu_to_le64(req->mr->iova); xfer_len = req->mr->length; /* Check if PI is added by the HW */ if (!pi_count) xfer_len += (xfer_len >> bi->interval_exp) * ns->head->pi_size; put_unaligned_le24(xfer_len, sg->length); put_unaligned_le32(req->mr->rkey, sg->key); sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4; return 0; mr_put: ib_mr_pool_put(queue->qp, &queue->qp->sig_mrs, req->mr); req->mr = NULL; if (nr < 0) return nr; return -EINVAL; } static int nvme_rdma_dma_map_req(struct ib_device *ibdev, struct request *rq, int *count, int *pi_count) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); int ret; req->data_sgl.sg_table.sgl = (struct scatterlist *)(req + 1); ret = sg_alloc_table_chained(&req->data_sgl.sg_table, blk_rq_nr_phys_segments(rq), req->data_sgl.sg_table.sgl, NVME_INLINE_SG_CNT); if (ret) return -ENOMEM; req->data_sgl.nents = blk_rq_map_sg(rq, req->data_sgl.sg_table.sgl); *count = ib_dma_map_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents, rq_dma_dir(rq)); if (unlikely(*count <= 0)) { ret = -EIO; goto out_free_table; } if (blk_integrity_rq(rq)) { req->metadata_sgl->sg_table.sgl = (struct scatterlist *)(req->metadata_sgl + 1); ret = sg_alloc_table_chained(&req->metadata_sgl->sg_table, rq->nr_integrity_segments, req->metadata_sgl->sg_table.sgl, NVME_INLINE_METADATA_SG_CNT); if (unlikely(ret)) { ret = -ENOMEM; goto out_unmap_sg; } req->metadata_sgl->nents = blk_rq_map_integrity_sg(rq, req->metadata_sgl->sg_table.sgl); *pi_count = ib_dma_map_sg(ibdev, req->metadata_sgl->sg_table.sgl, req->metadata_sgl->nents, rq_dma_dir(rq)); if (unlikely(*pi_count <= 0)) { ret = -EIO; goto out_free_pi_table; } } return 0; out_free_pi_table: sg_free_table_chained(&req->metadata_sgl->sg_table, NVME_INLINE_METADATA_SG_CNT); out_unmap_sg: ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents, rq_dma_dir(rq)); out_free_table: sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT); return ret; } static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, struct request *rq, struct nvme_command *c) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); struct nvme_rdma_device *dev = queue->device; struct ib_device *ibdev = dev->dev; int pi_count = 0; int count, ret; req->num_sge = 1; refcount_set(&req->ref, 2); /* send and recv completions */ c->common.flags |= NVME_CMD_SGL_METABUF; if (!blk_rq_nr_phys_segments(rq)) return nvme_rdma_set_sg_null(c); ret = nvme_rdma_dma_map_req(ibdev, rq, &count, &pi_count); if (unlikely(ret)) return ret; if (req->use_sig_mr) { ret = nvme_rdma_map_sg_pi(queue, req, c, count, pi_count); goto out; } if (count <= dev->num_inline_segments) { if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) && queue->ctrl->use_inline_data && blk_rq_payload_bytes(rq) <= nvme_rdma_inline_data_size(queue)) { ret = nvme_rdma_map_sg_inline(queue, req, c, count); goto out; } if (count == 1 && dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) { ret = nvme_rdma_map_sg_single(queue, req, c); goto out; } } ret = nvme_rdma_map_sg_fr(queue, req, c, count); out: if (unlikely(ret)) goto out_dma_unmap_req; return 0; out_dma_unmap_req: nvme_rdma_dma_unmap_req(ibdev, rq); return ret; } static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvme_rdma_qe *qe = container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe); struct nvme_rdma_request *req = container_of(qe, struct nvme_rdma_request, sqe); if (unlikely(wc->status != IB_WC_SUCCESS)) nvme_rdma_wr_error(cq, wc, "SEND"); else nvme_rdma_end_request(req); } static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge, struct ib_send_wr *first) { struct ib_send_wr wr; int ret; sge->addr = qe->dma; sge->length = sizeof(struct nvme_command); sge->lkey = queue->device->pd->local_dma_lkey; wr.next = NULL; wr.wr_cqe = &qe->cqe; wr.sg_list = sge; wr.num_sge = num_sge; wr.opcode = IB_WR_SEND; wr.send_flags = IB_SEND_SIGNALED; if (first) first->next = &wr; else first = &wr; ret = ib_post_send(queue->qp, first, NULL); if (unlikely(ret)) { dev_err(queue->ctrl->ctrl.device, "%s failed with error code %d\n", __func__, ret); } return ret; } static int nvme_rdma_post_recv(struct nvme_rdma_queue *queue, struct nvme_rdma_qe *qe) { struct ib_recv_wr wr; struct ib_sge list; int ret; list.addr = qe->dma; list.length = sizeof(struct nvme_completion); list.lkey = queue->device->pd->local_dma_lkey; qe->cqe.done = nvme_rdma_recv_done; wr.next = NULL; wr.wr_cqe = &qe->cqe; wr.sg_list = &list; wr.num_sge = 1; ret = ib_post_recv(queue->qp, &wr, NULL); if (unlikely(ret)) { dev_err(queue->ctrl->ctrl.device, "%s failed with error code %d\n", __func__, ret); } return ret; } static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue) { u32 queue_idx = nvme_rdma_queue_idx(queue); if (queue_idx == 0) return queue->ctrl->admin_tag_set.tags[queue_idx]; return queue->ctrl->tag_set.tags[queue_idx - 1]; } static void nvme_rdma_async_done(struct ib_cq *cq, struct ib_wc *wc) { if (unlikely(wc->status != IB_WC_SUCCESS)) nvme_rdma_wr_error(cq, wc, "ASYNC"); } static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg); struct nvme_rdma_queue *queue = &ctrl->queues[0]; struct ib_device *dev = queue->device->dev; struct nvme_rdma_qe *sqe = &ctrl->async_event_sqe; struct nvme_command *cmd = sqe->data; struct ib_sge sge; int ret; ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE); memset(cmd, 0, sizeof(*cmd)); cmd->common.opcode = nvme_admin_async_event; cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH; cmd->common.flags |= NVME_CMD_SGL_METABUF; nvme_rdma_set_sg_null(cmd); sqe->cqe.done = nvme_rdma_async_done; ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE); ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL); WARN_ON_ONCE(ret); } static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, struct nvme_completion *cqe, struct ib_wc *wc) { struct request *rq; struct nvme_rdma_request *req; rq = nvme_find_rq(nvme_rdma_tagset(queue), cqe->command_id); if (!rq) { dev_err(queue->ctrl->ctrl.device, "got bad command_id %#x on QP %#x\n", cqe->command_id, queue->qp->qp_num); nvme_rdma_error_recovery(queue->ctrl); return; } req = blk_mq_rq_to_pdu(rq); req->status = cqe->status; req->result = cqe->result; if (wc->wc_flags & IB_WC_WITH_INVALIDATE) { if (unlikely(!req->mr || wc->ex.invalidate_rkey != req->mr->rkey)) { dev_err(queue->ctrl->ctrl.device, "Bogus remote invalidation for rkey %#x\n", req->mr ? req->mr->rkey : 0); nvme_rdma_error_recovery(queue->ctrl); } } else if (req->mr) { int ret; ret = nvme_rdma_inv_rkey(queue, req); if (unlikely(ret < 0)) { dev_err(queue->ctrl->ctrl.device, "Queueing INV WR for rkey %#x failed (%d)\n", req->mr->rkey, ret); nvme_rdma_error_recovery(queue->ctrl); } /* the local invalidation completion will end the request */ return; } nvme_rdma_end_request(req); } static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvme_rdma_qe *qe = container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe); struct nvme_rdma_queue *queue = wc->qp->qp_context; struct ib_device *ibdev = queue->device->dev; struct nvme_completion *cqe = qe->data; const size_t len = sizeof(struct nvme_completion); if (unlikely(wc->status != IB_WC_SUCCESS)) { nvme_rdma_wr_error(cq, wc, "RECV"); return; } /* sanity checking for received data length */ if (unlikely(wc->byte_len < len)) { dev_err(queue->ctrl->ctrl.device, "Unexpected nvme completion length(%d)\n", wc->byte_len); nvme_rdma_error_recovery(queue->ctrl); return; } ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE); /* * AEN requests are special as they don't time out and can * survive any kind of queue freeze and often don't respond to * aborts. We don't even bother to allocate a struct request * for them but rather special case them here. */ if (unlikely(nvme_is_aen_req(nvme_rdma_queue_idx(queue), cqe->command_id))) nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, &cqe->result); else nvme_rdma_process_nvme_rsp(queue, cqe, wc); ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE); nvme_rdma_post_recv(queue, qe); } static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue) { int ret, i; for (i = 0; i < queue->queue_size; i++) { ret = nvme_rdma_post_recv(queue, &queue->rsp_ring[i]); if (ret) return ret; } return 0; } static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue, struct rdma_cm_event *ev) { struct rdma_cm_id *cm_id = queue->cm_id; int status = ev->status; const char *rej_msg; const struct nvme_rdma_cm_rej *rej_data; u8 rej_data_len; rej_msg = rdma_reject_msg(cm_id, status); rej_data = rdma_consumer_reject_data(cm_id, ev, &rej_data_len); if (rej_data && rej_data_len >= sizeof(u16)) { u16 sts = le16_to_cpu(rej_data->sts); dev_err(queue->ctrl->ctrl.device, "Connect rejected: status %d (%s) nvme status %d (%s).\n", status, rej_msg, sts, nvme_rdma_cm_msg(sts)); } else { dev_err(queue->ctrl->ctrl.device, "Connect rejected: status %d (%s).\n", status, rej_msg); } return -ECONNRESET; } static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue) { struct nvme_ctrl *ctrl = &queue->ctrl->ctrl; int ret; ret = nvme_rdma_create_queue_ib(queue); if (ret) return ret; if (ctrl->opts->tos >= 0) rdma_set_service_type(queue->cm_id, ctrl->opts->tos); ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CM_TIMEOUT_MS); if (ret) { dev_err(ctrl->device, "rdma_resolve_route failed (%d).\n", queue->cm_error); goto out_destroy_queue; } return 0; out_destroy_queue: nvme_rdma_destroy_queue_ib(queue); return ret; } static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue) { struct nvme_rdma_ctrl *ctrl = queue->ctrl; struct rdma_conn_param param = { }; struct nvme_rdma_cm_req priv = { }; int ret; param.qp_num = queue->qp->qp_num; param.flow_control = 1; param.responder_resources = queue->device->dev->attrs.max_qp_rd_atom; /* maximum retry count */ param.retry_count = 7; param.rnr_retry_count = 7; param.private_data = &priv; param.private_data_len = sizeof(priv); priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0); priv.qid = cpu_to_le16(nvme_rdma_queue_idx(queue)); /* * set the admin queue depth to the minimum size * specified by the Fabrics standard. */ if (priv.qid == 0) { priv.hrqsize = cpu_to_le16(NVME_AQ_DEPTH); priv.hsqsize = cpu_to_le16(NVME_AQ_DEPTH - 1); } else { /* * current interpretation of the fabrics spec * is at minimum you make hrqsize sqsize+1, or a * 1's based representation of sqsize. */ priv.hrqsize = cpu_to_le16(queue->queue_size); priv.hsqsize = cpu_to_le16(queue->ctrl->ctrl.sqsize); /* cntlid should only be set when creating an I/O queue */ priv.cntlid = cpu_to_le16(ctrl->ctrl.cntlid); } ret = rdma_connect_locked(queue->cm_id, &param); if (ret) { dev_err(ctrl->ctrl.device, "rdma_connect_locked failed (%d).\n", ret); return ret; } return 0; } static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *ev) { struct nvme_rdma_queue *queue = cm_id->context; int cm_error = 0; dev_dbg(queue->ctrl->ctrl.device, "%s (%d): status %d id %p\n", rdma_event_msg(ev->event), ev->event, ev->status, cm_id); switch (ev->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: cm_error = nvme_rdma_addr_resolved(queue); break; case RDMA_CM_EVENT_ROUTE_RESOLVED: cm_error = nvme_rdma_route_resolved(queue); break; case RDMA_CM_EVENT_ESTABLISHED: queue->cm_error = nvme_rdma_conn_established(queue); /* complete cm_done regardless of success/failure */ complete(&queue->cm_done); return 0; case RDMA_CM_EVENT_REJECTED: cm_error = nvme_rdma_conn_rejected(queue, ev); break; case RDMA_CM_EVENT_ROUTE_ERROR: case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_ADDR_ERROR: dev_dbg(queue->ctrl->ctrl.device, "CM error event %d\n", ev->event); cm_error = -ECONNRESET; break; case RDMA_CM_EVENT_DISCONNECTED: case RDMA_CM_EVENT_ADDR_CHANGE: case RDMA_CM_EVENT_TIMEWAIT_EXIT: dev_dbg(queue->ctrl->ctrl.device, "disconnect received - connection closed\n"); nvme_rdma_error_recovery(queue->ctrl); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: /* device removal is handled via the ib_client API */ break; default: dev_err(queue->ctrl->ctrl.device, "Unexpected RDMA CM event (%d)\n", ev->event); nvme_rdma_error_recovery(queue->ctrl); break; } if (cm_error) { queue->cm_error = cm_error; complete(&queue->cm_done); } return 0; } static void nvme_rdma_complete_timed_out(struct request *rq) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); struct nvme_rdma_queue *queue = req->queue; nvme_rdma_stop_queue(queue); nvmf_complete_timed_out_request(rq); } static enum blk_eh_timer_return nvme_rdma_timeout(struct request *rq) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); struct nvme_rdma_queue *queue = req->queue; struct nvme_rdma_ctrl *ctrl = queue->ctrl; struct nvme_command *cmd = req->req.cmd; int qid = nvme_rdma_queue_idx(queue); dev_warn(ctrl->ctrl.device, "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout\n", rq->tag, nvme_cid(rq), cmd->common.opcode, nvme_fabrics_opcode_str(qid, cmd), qid); if (nvme_ctrl_state(&ctrl->ctrl) != NVME_CTRL_LIVE) { /* * If we are resetting, connecting or deleting we should * complete immediately because we may block controller * teardown or setup sequence * - ctrl disable/shutdown fabrics requests * - connect requests * - initialization admin requests * - I/O requests that entered after unquiescing and * the controller stopped responding * * All other requests should be cancelled by the error * recovery work, so it's fine that we fail it here. */ nvme_rdma_complete_timed_out(rq); return BLK_EH_DONE; } /* * LIVE state should trigger the normal error recovery which will * handle completing this request. */ nvme_rdma_error_recovery(ctrl); return BLK_EH_RESET_TIMER; } static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct nvme_ns *ns = hctx->queue->queuedata; struct nvme_rdma_queue *queue = hctx->driver_data; struct request *rq = bd->rq; struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); struct nvme_rdma_qe *sqe = &req->sqe; struct nvme_command *c = nvme_req(rq)->cmd; struct ib_device *dev; bool queue_ready = test_bit(NVME_RDMA_Q_LIVE, &queue->flags); blk_status_t ret; int err; WARN_ON_ONCE(rq->tag < 0); if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq); dev = queue->device->dev; req->sqe.dma = ib_dma_map_single(dev, req->sqe.data, sizeof(struct nvme_command), DMA_TO_DEVICE); err = ib_dma_mapping_error(dev, req->sqe.dma); if (unlikely(err)) return BLK_STS_RESOURCE; ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(struct nvme_command), DMA_TO_DEVICE); ret = nvme_setup_cmd(ns, rq); if (ret) goto unmap_qe; nvme_start_request(rq); if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && queue->pi_support && (c->common.opcode == nvme_cmd_write || c->common.opcode == nvme_cmd_read) && nvme_ns_has_pi(ns->head)) req->use_sig_mr = true; else req->use_sig_mr = false; err = nvme_rdma_map_data(queue, rq, c); if (unlikely(err < 0)) { dev_err(queue->ctrl->ctrl.device, "Failed to map data (%d)\n", err); goto err; } sqe->cqe.done = nvme_rdma_send_done; ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(struct nvme_command), DMA_TO_DEVICE); err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge, req->mr ? &req->reg_wr.wr : NULL); if (unlikely(err)) goto err_unmap; return BLK_STS_OK; err_unmap: nvme_rdma_unmap_data(queue, rq); err: if (err == -EIO) ret = nvme_host_path_error(rq); else if (err == -ENOMEM || err == -EAGAIN) ret = BLK_STS_RESOURCE; else ret = BLK_STS_IOERR; nvme_cleanup_cmd(rq); unmap_qe: ib_dma_unmap_single(dev, req->sqe.dma, sizeof(struct nvme_command), DMA_TO_DEVICE); return ret; } static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) { struct nvme_rdma_queue *queue = hctx->driver_data; return ib_process_cq_direct(queue->ib_cq, -1); } static void nvme_rdma_check_pi_status(struct nvme_rdma_request *req) { struct request *rq = blk_mq_rq_from_pdu(req); struct ib_mr_status mr_status; int ret; ret = ib_check_mr_status(req->mr, IB_MR_CHECK_SIG_STATUS, &mr_status); if (ret) { pr_err("ib_check_mr_status failed, ret %d\n", ret); nvme_req(rq)->status = NVME_SC_INVALID_PI; return; } if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) { switch (mr_status.sig_err.err_type) { case IB_SIG_BAD_GUARD: nvme_req(rq)->status = NVME_SC_GUARD_CHECK; break; case IB_SIG_BAD_REFTAG: nvme_req(rq)->status = NVME_SC_REFTAG_CHECK; break; case IB_SIG_BAD_APPTAG: nvme_req(rq)->status = NVME_SC_APPTAG_CHECK; break; } pr_err("PI error found type %d expected 0x%x vs actual 0x%x\n", mr_status.sig_err.err_type, mr_status.sig_err.expected, mr_status.sig_err.actual); } } static void nvme_rdma_complete_rq(struct request *rq) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); struct nvme_rdma_queue *queue = req->queue; struct ib_device *ibdev = queue->device->dev; if (req->use_sig_mr) nvme_rdma_check_pi_status(req); nvme_rdma_unmap_data(queue, rq); ib_dma_unmap_single(ibdev, req->sqe.dma, sizeof(struct nvme_command), DMA_TO_DEVICE); nvme_complete_rq(rq); } static void nvme_rdma_map_queues(struct blk_mq_tag_set *set) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(set->driver_data); nvmf_map_queues(set, &ctrl->ctrl, ctrl->io_queues); } static const struct blk_mq_ops nvme_rdma_mq_ops = { .queue_rq = nvme_rdma_queue_rq, .complete = nvme_rdma_complete_rq, .init_request = nvme_rdma_init_request, .exit_request = nvme_rdma_exit_request, .init_hctx = nvme_rdma_init_hctx, .timeout = nvme_rdma_timeout, .map_queues = nvme_rdma_map_queues, .poll = nvme_rdma_poll, }; static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { .queue_rq = nvme_rdma_queue_rq, .complete = nvme_rdma_complete_rq, .init_request = nvme_rdma_init_request, .exit_request = nvme_rdma_exit_request, .init_hctx = nvme_rdma_init_admin_hctx, .timeout = nvme_rdma_timeout, }; static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) { nvme_rdma_teardown_io_queues(ctrl, shutdown); nvme_quiesce_admin_queue(&ctrl->ctrl); nvme_disable_ctrl(&ctrl->ctrl, shutdown); nvme_rdma_teardown_admin_queue(ctrl, shutdown); } static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl) { nvme_rdma_shutdown_ctrl(to_rdma_ctrl(ctrl), true); } static void nvme_rdma_reset_ctrl_work(struct work_struct *work) { struct nvme_rdma_ctrl *ctrl = container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work); int ret; nvme_stop_ctrl(&ctrl->ctrl); nvme_rdma_shutdown_ctrl(ctrl, false); if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { /* state change failure should never happen */ WARN_ON_ONCE(1); return; } ret = nvme_rdma_setup_ctrl(ctrl, false); if (ret) goto out_fail; return; out_fail: ++ctrl->ctrl.nr_reconnects; nvme_rdma_reconnect_or_remove(ctrl, ret); } static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { .name = "rdma", .module = THIS_MODULE, .flags = NVME_F_FABRICS | NVME_F_METADATA_SUPPORTED, .reg_read32 = nvmf_reg_read32, .reg_read64 = nvmf_reg_read64, .reg_write32 = nvmf_reg_write32, .subsystem_reset = nvmf_subsystem_reset, .free_ctrl = nvme_rdma_free_ctrl, .submit_async_event = nvme_rdma_submit_async_event, .delete_ctrl = nvme_rdma_delete_ctrl, .get_address = nvmf_get_address, .stop_ctrl = nvme_rdma_stop_ctrl, }; /* * Fails a connection request if it matches an existing controller * (association) with the same tuple: * <Host NQN, Host ID, local address, remote address, remote port, SUBSYS NQN> * * if local address is not specified in the request, it will match an * existing controller with all the other parameters the same and no * local port address specified as well. * * The ports don't need to be compared as they are intrinsically * already matched by the port pointers supplied. */ static bool nvme_rdma_existing_controller(struct nvmf_ctrl_options *opts) { struct nvme_rdma_ctrl *ctrl; bool found = false; mutex_lock(&nvme_rdma_ctrl_mutex); list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) { found = nvmf_ip_options_match(&ctrl->ctrl, opts); if (found) break; } mutex_unlock(&nvme_rdma_ctrl_mutex); return found; } static struct nvme_rdma_ctrl *nvme_rdma_alloc_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) { struct nvme_rdma_ctrl *ctrl; int ret; ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); if (!ctrl) return ERR_PTR(-ENOMEM); ctrl->ctrl.opts = opts; INIT_LIST_HEAD(&ctrl->list); if (!(opts->mask & NVMF_OPT_TRSVCID)) { opts->trsvcid = kstrdup(__stringify(NVME_RDMA_IP_PORT), GFP_KERNEL); if (!opts->trsvcid) { ret = -ENOMEM; goto out_free_ctrl; } opts->mask |= NVMF_OPT_TRSVCID; } ret = inet_pton_with_scope(&init_net, AF_UNSPEC, opts->traddr, opts->trsvcid, &ctrl->addr); if (ret) { pr_err("malformed address passed: %s:%s\n", opts->traddr, opts->trsvcid); goto out_free_ctrl; } if (opts->mask & NVMF_OPT_HOST_TRADDR) { ret = inet_pton_with_scope(&init_net, AF_UNSPEC, opts->host_traddr, NULL, &ctrl->src_addr); if (ret) { pr_err("malformed src address passed: %s\n", opts->host_traddr); goto out_free_ctrl; } } if (!opts->duplicate_connect && nvme_rdma_existing_controller(opts)) { ret = -EALREADY; goto out_free_ctrl; } INIT_DELAYED_WORK(&ctrl->reconnect_work, nvme_rdma_reconnect_ctrl_work); INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work); INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work); ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + opts->nr_poll_queues + 1; ctrl->ctrl.sqsize = opts->queue_size - 1; ctrl->ctrl.kato = opts->kato; ret = -ENOMEM; ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues), GFP_KERNEL); if (!ctrl->queues) goto out_free_ctrl; ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops, 0 /* no quirks, we're perfect! */); if (ret) goto out_kfree_queues; return ctrl; out_kfree_queues: kfree(ctrl->queues); out_free_ctrl: kfree(ctrl); return ERR_PTR(ret); } static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) { struct nvme_rdma_ctrl *ctrl; bool changed; int ret; ctrl = nvme_rdma_alloc_ctrl(dev, opts); if (IS_ERR(ctrl)) return ERR_CAST(ctrl); ret = nvme_add_ctrl(&ctrl->ctrl); if (ret) goto out_put_ctrl; changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING); WARN_ON_ONCE(!changed); ret = nvme_rdma_setup_ctrl(ctrl, true); if (ret) goto out_uninit_ctrl; dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs, hostnqn: %s\n", nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr, opts->host->nqn); mutex_lock(&nvme_rdma_ctrl_mutex); list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); mutex_unlock(&nvme_rdma_ctrl_mutex); return &ctrl->ctrl; out_uninit_ctrl: nvme_uninit_ctrl(&ctrl->ctrl); out_put_ctrl: nvme_put_ctrl(&ctrl->ctrl); if (ret > 0) ret = -EIO; return ERR_PTR(ret); } static struct nvmf_transport_ops nvme_rdma_transport = { .name = "rdma", .module = THIS_MODULE, .required_opts = NVMF_OPT_TRADDR, .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES | NVMF_OPT_TOS, .create_ctrl = nvme_rdma_create_ctrl, }; static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data) { struct nvme_rdma_ctrl *ctrl; struct nvme_rdma_device *ndev; bool found = false; mutex_lock(&device_list_mutex); list_for_each_entry(ndev, &device_list, entry) { if (ndev->dev == ib_device) { found = true; break; } } mutex_unlock(&device_list_mutex); if (!found) return; /* Delete all controllers using this device */ mutex_lock(&nvme_rdma_ctrl_mutex); list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) { if (ctrl->device->dev != ib_device) continue; nvme_delete_ctrl(&ctrl->ctrl); } mutex_unlock(&nvme_rdma_ctrl_mutex); flush_workqueue(nvme_delete_wq); } static struct ib_client nvme_rdma_ib_client = { .name = "nvme_rdma", .remove = nvme_rdma_remove_one }; static int __init nvme_rdma_init_module(void) { int ret; ret = ib_register_client(&nvme_rdma_ib_client); if (ret) return ret; ret = nvmf_register_transport(&nvme_rdma_transport); if (ret) goto err_unreg_client; return 0; err_unreg_client: ib_unregister_client(&nvme_rdma_ib_client); return ret; } static void __exit nvme_rdma_cleanup_module(void) { struct nvme_rdma_ctrl *ctrl; nvmf_unregister_transport(&nvme_rdma_transport); ib_unregister_client(&nvme_rdma_ib_client); mutex_lock(&nvme_rdma_ctrl_mutex); list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) nvme_delete_ctrl(&ctrl->ctrl); mutex_unlock(&nvme_rdma_ctrl_mutex); flush_workqueue(nvme_delete_wq); } module_init(nvme_rdma_init_module); module_exit(nvme_rdma_cleanup_module); MODULE_DESCRIPTION("NVMe host RDMA transport driver"); MODULE_LICENSE("GPL v2");
20 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 // SPDX-License-Identifier: GPL-2.0-only /* * Yama Linux Security Module * * Author: Kees Cook <keescook@chromium.org> * * Copyright (C) 2010 Canonical, Ltd. * Copyright (C) 2011 The Chromium OS Authors. */ #include <linux/lsm_hooks.h> #include <linux/sysctl.h> #include <linux/ptrace.h> #include <linux/prctl.h> #include <linux/ratelimit.h> #include <linux/workqueue.h> #include <linux/string_helpers.h> #include <linux/task_work.h> #include <linux/sched.h> #include <linux/spinlock.h> #include <uapi/linux/lsm.h> #define YAMA_SCOPE_DISABLED 0 #define YAMA_SCOPE_RELATIONAL 1 #define YAMA_SCOPE_CAPABILITY 2 #define YAMA_SCOPE_NO_ATTACH 3 static int ptrace_scope = YAMA_SCOPE_RELATIONAL; /* describe a ptrace relationship for potential exception */ struct ptrace_relation { struct task_struct *tracer; struct task_struct *tracee; bool invalid; struct list_head node; struct rcu_head rcu; }; static LIST_HEAD(ptracer_relations); static DEFINE_SPINLOCK(ptracer_relations_lock); static void yama_relation_cleanup(struct work_struct *work); static DECLARE_WORK(yama_relation_work, yama_relation_cleanup); struct access_report_info { struct callback_head work; const char *access; struct task_struct *target; struct task_struct *agent; }; static void __report_access(struct callback_head *work) { struct access_report_info *info = container_of(work, struct access_report_info, work); char *target_cmd, *agent_cmd; target_cmd = kstrdup_quotable_cmdline(info->target, GFP_KERNEL); agent_cmd = kstrdup_quotable_cmdline(info->agent, GFP_KERNEL); pr_notice_ratelimited( "ptrace %s of \"%s\"[%d] was attempted by \"%s\"[%d]\n", info->access, target_cmd, info->target->pid, agent_cmd, info->agent->pid); kfree(agent_cmd); kfree(target_cmd); put_task_struct(info->agent); put_task_struct(info->target); kfree(info); } /* defers execution because cmdline access can sleep */ static void report_access(const char *access, struct task_struct *target, struct task_struct *agent) { struct access_report_info *info; assert_spin_locked(&target->alloc_lock); /* for target->comm */ if (current->flags & PF_KTHREAD) { /* I don't think kthreads call task_work_run() before exiting. * Imagine angry ranting about procfs here. */ pr_notice_ratelimited( "ptrace %s of \"%s\"[%d] was attempted by \"%s\"[%d]\n", access, target->comm, target->pid, agent->comm, agent->pid); return; } info = kmalloc(sizeof(*info), GFP_ATOMIC); if (!info) return; init_task_work(&info->work, __report_access); get_task_struct(target); get_task_struct(agent); info->access = access; info->target = target; info->agent = agent; if (task_work_add(current, &info->work, TWA_RESUME) == 0) return; /* success */ WARN(1, "report_access called from exiting task"); put_task_struct(target); put_task_struct(agent); kfree(info); } /** * yama_relation_cleanup - remove invalid entries from the relation list * @work: unused * */ static void yama_relation_cleanup(struct work_struct *work) { struct ptrace_relation *relation; spin_lock(&ptracer_relations_lock); rcu_read_lock(); list_for_each_entry_rcu(relation, &ptracer_relations, node) { if (relation->invalid) { list_del_rcu(&relation->node); kfree_rcu(relation, rcu); } } rcu_read_unlock(); spin_unlock(&ptracer_relations_lock); } /** * yama_ptracer_add - add/replace an exception for this tracer/tracee pair * @tracer: the task_struct of the process doing the ptrace * @tracee: the task_struct of the process to be ptraced * * Each tracee can have, at most, one tracer registered. Each time this * is called, the prior registered tracer will be replaced for the tracee. * * Returns 0 if relationship was added, -ve on error. */ static int yama_ptracer_add(struct task_struct *tracer, struct task_struct *tracee) { struct ptrace_relation *relation, *added; added = kmalloc(sizeof(*added), GFP_KERNEL); if (!added) return -ENOMEM; added->tracee = tracee; added->tracer = tracer; added->invalid = false; spin_lock(&ptracer_relations_lock); rcu_read_lock(); list_for_each_entry_rcu(relation, &ptracer_relations, node) { if (relation->invalid) continue; if (relation->tracee == tracee) { list_replace_rcu(&relation->node, &added->node); kfree_rcu(relation, rcu); goto out; } } list_add_rcu(&added->node, &ptracer_relations); out: rcu_read_unlock(); spin_unlock(&ptracer_relations_lock); return 0; } /** * yama_ptracer_del - remove exceptions related to the given tasks * @tracer: remove any relation where tracer task matches * @tracee: remove any relation where tracee task matches */ static void yama_ptracer_del(struct task_struct *tracer, struct task_struct *tracee) { struct ptrace_relation *relation; bool marked = false; rcu_read_lock(); list_for_each_entry_rcu(relation, &ptracer_relations, node) { if (relation->invalid) continue; if (relation->tracee == tracee || (tracer && relation->tracer == tracer)) { relation->invalid = true; marked = true; } } rcu_read_unlock(); if (marked) schedule_work(&yama_relation_work); } /** * yama_task_free - check for task_pid to remove from exception list * @task: task being removed */ static void yama_task_free(struct task_struct *task) { yama_ptracer_del(task, task); } /** * yama_task_prctl - check for Yama-specific prctl operations * @option: operation * @arg2: argument * @arg3: argument * @arg4: argument * @arg5: argument * * Return 0 on success, -ve on error. -ENOSYS is returned when Yama * does not handle the given option. */ static int yama_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { int rc = -ENOSYS; struct task_struct *myself; switch (option) { case PR_SET_PTRACER: /* Since a thread can call prctl(), find the group leader * before calling _add() or _del() on it, since we want * process-level granularity of control. The tracer group * leader checking is handled later when walking the ancestry * at the time of PTRACE_ATTACH check. */ myself = current->group_leader; if (arg2 == 0) { yama_ptracer_del(NULL, myself); rc = 0; } else if (arg2 == PR_SET_PTRACER_ANY || (int)arg2 == -1) { rc = yama_ptracer_add(NULL, myself); } else { struct task_struct *tracer; tracer = find_get_task_by_vpid(arg2); if (!tracer) { rc = -EINVAL; } else { rc = yama_ptracer_add(tracer, myself); put_task_struct(tracer); } } break; } return rc; } /** * task_is_descendant - walk up a process family tree looking for a match * @parent: the process to compare against while walking up from child * @child: the process to start from while looking upwards for parent * * Returns 1 if child is a descendant of parent, 0 if not. */ static int task_is_descendant(struct task_struct *parent, struct task_struct *child) { int rc = 0; struct task_struct *walker = child; if (!parent || !child) return 0; rcu_read_lock(); if (!thread_group_leader(parent)) parent = rcu_dereference(parent->group_leader); while (walker->pid > 0) { if (!thread_group_leader(walker)) walker = rcu_dereference(walker->group_leader); if (walker == parent) { rc = 1; break; } walker = rcu_dereference(walker->real_parent); } rcu_read_unlock(); return rc; } /** * ptracer_exception_found - tracer registered as exception for this tracee * @tracer: the task_struct of the process attempting ptrace * @tracee: the task_struct of the process to be ptraced * * Returns 1 if tracer has a ptracer exception ancestor for tracee. */ static int ptracer_exception_found(struct task_struct *tracer, struct task_struct *tracee) { int rc = 0; struct ptrace_relation *relation; struct task_struct *parent = NULL; bool found = false; rcu_read_lock(); /* * If there's already an active tracing relationship, then make an * exception for the sake of other accesses, like process_vm_rw(). */ parent = ptrace_parent(tracee); if (parent != NULL && same_thread_group(parent, tracer)) { rc = 1; goto unlock; } /* Look for a PR_SET_PTRACER relationship. */ if (!thread_group_leader(tracee)) tracee = rcu_dereference(tracee->group_leader); list_for_each_entry_rcu(relation, &ptracer_relations, node) { if (relation->invalid) continue; if (relation->tracee == tracee) { parent = relation->tracer; found = true; break; } } if (found && (parent == NULL || task_is_descendant(parent, tracer))) rc = 1; unlock: rcu_read_unlock(); return rc; } /** * yama_ptrace_access_check - validate PTRACE_ATTACH calls * @child: task that current task is attempting to ptrace * @mode: ptrace attach mode * * Returns 0 if following the ptrace is allowed, -ve on error. */ static int yama_ptrace_access_check(struct task_struct *child, unsigned int mode) { int rc = 0; /* require ptrace target be a child of ptracer on attach */ if (mode & PTRACE_MODE_ATTACH) { switch (ptrace_scope) { case YAMA_SCOPE_DISABLED: /* No additional restrictions. */ break; case YAMA_SCOPE_RELATIONAL: rcu_read_lock(); if (!pid_alive(child)) rc = -EPERM; if (!rc && !task_is_descendant(current, child) && !ptracer_exception_found(current, child) && !ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE)) rc = -EPERM; rcu_read_unlock(); break; case YAMA_SCOPE_CAPABILITY: rcu_read_lock(); if (!ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE)) rc = -EPERM; rcu_read_unlock(); break; case YAMA_SCOPE_NO_ATTACH: default: rc = -EPERM; break; } } if (rc && (mode & PTRACE_MODE_NOAUDIT) == 0) report_access("attach", child, current); return rc; } /** * yama_ptrace_traceme - validate PTRACE_TRACEME calls * @parent: task that will become the ptracer of the current task * * Returns 0 if following the ptrace is allowed, -ve on error. */ static int yama_ptrace_traceme(struct task_struct *parent) { int rc = 0; /* Only disallow PTRACE_TRACEME on more aggressive settings. */ switch (ptrace_scope) { case YAMA_SCOPE_CAPABILITY: if (!has_ns_capability(parent, current_user_ns(), CAP_SYS_PTRACE)) rc = -EPERM; break; case YAMA_SCOPE_NO_ATTACH: rc = -EPERM; break; } if (rc) { task_lock(current); report_access("traceme", current, parent); task_unlock(current); } return rc; } static const struct lsm_id yama_lsmid = { .name = "yama", .id = LSM_ID_YAMA, }; static struct security_hook_list yama_hooks[] __ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, yama_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, yama_ptrace_traceme), LSM_HOOK_INIT(task_prctl, yama_task_prctl), LSM_HOOK_INIT(task_free, yama_task_free), }; #ifdef CONFIG_SYSCTL static int yama_dointvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table table_copy; if (write && !capable(CAP_SYS_PTRACE)) return -EPERM; /* Lock the max value if it ever gets set. */ table_copy = *table; if (*(int *)table_copy.data == *(int *)table_copy.extra2) table_copy.extra1 = table_copy.extra2; return proc_dointvec_minmax(&table_copy, write, buffer, lenp, ppos); } static int max_scope = YAMA_SCOPE_NO_ATTACH; static const struct ctl_table yama_sysctl_table[] = { { .procname = "ptrace_scope", .data = &ptrace_scope, .maxlen = sizeof(int), .mode = 0644, .proc_handler = yama_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &max_scope, }, }; static void __init yama_init_sysctl(void) { if (!register_sysctl("kernel/yama", yama_sysctl_table)) panic("Yama: sysctl registration failed.\n"); } #else static inline void yama_init_sysctl(void) { } #endif /* CONFIG_SYSCTL */ static int __init yama_init(void) { pr_info("Yama: becoming mindful.\n"); security_add_hooks(yama_hooks, ARRAY_SIZE(yama_hooks), &yama_lsmid); yama_init_sysctl(); return 0; } DEFINE_LSM(yama) = { .name = "yama", .init = yama_init, };
57 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_GENERIC_CHECKSUM_H #define __ASM_GENERIC_CHECKSUM_H #include <linux/bitops.h> /* * computes the checksum of a memory block at buff, length len, * and adds in "sum" (32-bit) * * returns a 32-bit number suitable for feeding into itself * or csum_tcpudp_magic * * this function must be called with even lengths, except * for the last fragment, which may be odd * * it's best to have buff aligned on a 32-bit boundary */ extern __wsum csum_partial(const void *buff, int len, __wsum sum); #ifndef ip_fast_csum /* * This is a version of ip_compute_csum() optimized for IP headers, * which always checksum on 4 octet boundaries. */ extern __sum16 ip_fast_csum(const void *iph, unsigned int ihl); #endif #ifndef csum_fold /* * Fold a partial checksum */ static inline __sum16 csum_fold(__wsum csum) { u32 sum = (__force u32)csum; return (__force __sum16)((~sum - ror32(sum, 16)) >> 16); } #endif #ifndef csum_tcpudp_nofold /* * computes the checksum of the TCP/UDP pseudo-header * returns a 16-bit checksum, already complemented */ extern __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __wsum sum); #endif #ifndef csum_tcpudp_magic static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __wsum sum) { return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); } #endif /* * this routine is used for miscellaneous IP-like checksums, mainly * in icmp.c */ extern __sum16 ip_compute_csum(const void *buff, int len); #endif /* __ASM_GENERIC_CHECKSUM_H */
16 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __FIRMWARE_LOADER_H #define __FIRMWARE_LOADER_H #include <linux/bitops.h> #include <linux/firmware.h> #include <linux/types.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/completion.h> /** * enum fw_opt - options to control firmware loading behaviour * * @FW_OPT_UEVENT: Enables the fallback mechanism to send a kobject uevent * when the firmware is not found. Userspace is in charge to load the * firmware using the sysfs loading facility. * @FW_OPT_NOWAIT: Used to describe the firmware request is asynchronous. * @FW_OPT_USERHELPER: Enable the fallback mechanism, in case the direct * filesystem lookup fails at finding the firmware. For details refer to * firmware_fallback_sysfs(). * @FW_OPT_NO_WARN: Quiet, avoid printing warning messages. * @FW_OPT_NOCACHE: Disables firmware caching. Firmware caching is used to * cache the firmware upon suspend, so that upon resume races against the * firmware file lookup on storage is avoided. Used for calls where the * file may be too big, or where the driver takes charge of its own * firmware caching mechanism. * @FW_OPT_NOFALLBACK_SYSFS: Disable the sysfs fallback mechanism. Takes * precedence over &FW_OPT_UEVENT and &FW_OPT_USERHELPER. * @FW_OPT_FALLBACK_PLATFORM: Enable fallback to device fw copy embedded in * the platform's main firmware. If both this fallback and the sysfs * fallback are enabled, then this fallback will be tried first. * @FW_OPT_PARTIAL: Allow partial read of firmware instead of needing to read * entire file. */ enum fw_opt { FW_OPT_UEVENT = BIT(0), FW_OPT_NOWAIT = BIT(1), FW_OPT_USERHELPER = BIT(2), FW_OPT_NO_WARN = BIT(3), FW_OPT_NOCACHE = BIT(4), FW_OPT_NOFALLBACK_SYSFS = BIT(5), FW_OPT_FALLBACK_PLATFORM = BIT(6), FW_OPT_PARTIAL = BIT(7), }; enum fw_status { FW_STATUS_UNKNOWN, FW_STATUS_LOADING, FW_STATUS_DONE, FW_STATUS_ABORTED, }; /* * Concurrent request_firmware() for the same firmware need to be * serialized. struct fw_state is simple state machine which hold the * state of the firmware loading. */ struct fw_state { struct completion completion; enum fw_status status; }; struct fw_priv { struct kref ref; struct list_head list; struct firmware_cache *fwc; struct fw_state fw_st; void *data; size_t size; size_t allocated_size; size_t offset; u32 opt_flags; #ifdef CONFIG_FW_LOADER_PAGED_BUF bool is_paged_buf; struct page **pages; int nr_pages; int page_array_size; #endif #ifdef CONFIG_FW_LOADER_USER_HELPER bool need_uevent; struct list_head pending_list; #endif const char *fw_name; }; extern struct mutex fw_lock; extern struct firmware_cache fw_cache; extern bool fw_load_abort_all; static inline bool __fw_state_check(struct fw_priv *fw_priv, enum fw_status status) { struct fw_state *fw_st = &fw_priv->fw_st; return fw_st->status == status; } static inline int __fw_state_wait_common(struct fw_priv *fw_priv, long timeout) { struct fw_state *fw_st = &fw_priv->fw_st; long ret; ret = wait_for_completion_killable_timeout(&fw_st->completion, timeout); if (ret != 0 && fw_st->status == FW_STATUS_ABORTED) return -ENOENT; if (!ret) return -ETIMEDOUT; return ret < 0 ? ret : 0; } static inline void __fw_state_set(struct fw_priv *fw_priv, enum fw_status status) { struct fw_state *fw_st = &fw_priv->fw_st; WRITE_ONCE(fw_st->status, status); if (status == FW_STATUS_DONE || status == FW_STATUS_ABORTED) { #ifdef CONFIG_FW_LOADER_USER_HELPER /* * Doing this here ensures that the fw_priv is deleted from * the pending list in all abort/done paths. */ list_del_init(&fw_priv->pending_list); #endif complete_all(&fw_st->completion); } } static inline void fw_state_aborted(struct fw_priv *fw_priv) { __fw_state_set(fw_priv, FW_STATUS_ABORTED); } static inline bool fw_state_is_aborted(struct fw_priv *fw_priv) { return __fw_state_check(fw_priv, FW_STATUS_ABORTED); } static inline void fw_state_start(struct fw_priv *fw_priv) { __fw_state_set(fw_priv, FW_STATUS_LOADING); } static inline void fw_state_done(struct fw_priv *fw_priv) { __fw_state_set(fw_priv, FW_STATUS_DONE); } static inline bool fw_state_is_done(struct fw_priv *fw_priv) { return __fw_state_check(fw_priv, FW_STATUS_DONE); } static inline bool fw_state_is_loading(struct fw_priv *fw_priv) { return __fw_state_check(fw_priv, FW_STATUS_LOADING); } int alloc_lookup_fw_priv(const char *fw_name, struct firmware_cache *fwc, struct fw_priv **fw_priv, void *dbuf, size_t size, size_t offset, u32 opt_flags); int assign_fw(struct firmware *fw, struct device *device); void free_fw_priv(struct fw_priv *fw_priv); void fw_state_init(struct fw_priv *fw_priv); #ifdef CONFIG_FW_LOADER bool firmware_is_builtin(const struct firmware *fw); bool firmware_request_builtin_buf(struct firmware *fw, const char *name, void *buf, size_t size); #else /* module case */ static inline bool firmware_is_builtin(const struct firmware *fw) { return false; } static inline bool firmware_request_builtin_buf(struct firmware *fw, const char *name, void *buf, size_t size) { return false; } #endif #ifdef CONFIG_FW_LOADER_PAGED_BUF void fw_free_paged_buf(struct fw_priv *fw_priv); int fw_grow_paged_buf(struct fw_priv *fw_priv, int pages_needed); int fw_map_paged_buf(struct fw_priv *fw_priv); bool fw_is_paged_buf(struct fw_priv *fw_priv); #else static inline void fw_free_paged_buf(struct fw_priv *fw_priv) {} static inline int fw_grow_paged_buf(struct fw_priv *fw_priv, int pages_needed) { return -ENXIO; } static inline int fw_map_paged_buf(struct fw_priv *fw_priv) { return -ENXIO; } static inline bool fw_is_paged_buf(struct fw_priv *fw_priv) { return false; } #endif #endif /* __FIRMWARE_LOADER_H */
24 24 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 #include <linux/export.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/vmalloc.h> /* Allocate an array of spinlocks to be accessed by a hash. Two arguments * indicate the number of elements to allocate in the array. max_size * gives the maximum number of elements to allocate. cpu_mult gives * the number of locks per CPU to allocate. The size is rounded up * to a power of 2 to be suitable as a hash table. */ int __alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *locks_mask, size_t max_size, unsigned int cpu_mult, gfp_t gfp, const char *name, struct lock_class_key *key) { spinlock_t *tlocks = NULL; unsigned int i, size; #if defined(CONFIG_PROVE_LOCKING) unsigned int nr_pcpus = 2; #else unsigned int nr_pcpus = num_possible_cpus(); #endif if (cpu_mult) { nr_pcpus = min_t(unsigned int, nr_pcpus, 64UL); size = min_t(unsigned int, nr_pcpus * cpu_mult, max_size); } else { size = max_size; } if (sizeof(spinlock_t) != 0) { tlocks = kvmalloc_array(size, sizeof(spinlock_t), gfp); if (!tlocks) return -ENOMEM; for (i = 0; i < size; i++) { spin_lock_init(&tlocks[i]); lockdep_init_map(&tlocks[i].dep_map, name, key, 0); } } *locks = tlocks; *locks_mask = size - 1; return 0; } EXPORT_SYMBOL(__alloc_bucket_spinlocks); void free_bucket_spinlocks(spinlock_t *locks) { kvfree(locks); } EXPORT_SYMBOL(free_bucket_spinlocks);
90 11 87 523 521 520 57 523 520 46 468 469 445 87 470 469 470 112 112 112 112 112 24 24 45 46 46 4 24 24 46 46 4 45 45 45 45 45 45 378 3 380 378 11 379 6379 6328 380 380 1 375 376 376 1 89 90 90 90 90 11 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved. * Authors: David Chinner and Glauber Costa * * Generic LRU infrastructure */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/list_lru.h> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/memcontrol.h> #include "slab.h" #include "internal.h" #ifdef CONFIG_MEMCG static LIST_HEAD(memcg_list_lrus); static DEFINE_MUTEX(list_lrus_mutex); static inline bool list_lru_memcg_aware(struct list_lru *lru) { return lru->memcg_aware; } static void list_lru_register(struct list_lru *lru) { if (!list_lru_memcg_aware(lru)) return; mutex_lock(&list_lrus_mutex); list_add(&lru->list, &memcg_list_lrus); mutex_unlock(&list_lrus_mutex); } static void list_lru_unregister(struct list_lru *lru) { if (!list_lru_memcg_aware(lru)) return; mutex_lock(&list_lrus_mutex); list_del(&lru->list); mutex_unlock(&list_lrus_mutex); } static int lru_shrinker_id(struct list_lru *lru) { return lru->shrinker_id; } static inline struct list_lru_one * list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) { if (list_lru_memcg_aware(lru) && idx >= 0) { struct list_lru_memcg *mlru = xa_load(&lru->xa, idx); return mlru ? &mlru->node[nid] : NULL; } return &lru->node[nid].lru; } static inline bool lock_list_lru(struct list_lru_one *l, bool irq) { if (irq) spin_lock_irq(&l->lock); else spin_lock(&l->lock); if (unlikely(READ_ONCE(l->nr_items) == LONG_MIN)) { if (irq) spin_unlock_irq(&l->lock); else spin_unlock(&l->lock); return false; } return true; } static inline struct list_lru_one * lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, bool irq, bool skip_empty) { struct list_lru_one *l; rcu_read_lock(); again: l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); if (likely(l) && lock_list_lru(l, irq)) { rcu_read_unlock(); return l; } /* * Caller may simply bail out if raced with reparenting or * may iterate through the list_lru and expect empty slots. */ if (skip_empty) { rcu_read_unlock(); return NULL; } VM_WARN_ON(!css_is_dying(&memcg->css)); memcg = parent_mem_cgroup(memcg); goto again; } static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) { if (irq_off) spin_unlock_irq(&l->lock); else spin_unlock(&l->lock); } #else static void list_lru_register(struct list_lru *lru) { } static void list_lru_unregister(struct list_lru *lru) { } static int lru_shrinker_id(struct list_lru *lru) { return -1; } static inline bool list_lru_memcg_aware(struct list_lru *lru) { return false; } static inline struct list_lru_one * list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) { return &lru->node[nid].lru; } static inline struct list_lru_one * lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, bool irq, bool skip_empty) { struct list_lru_one *l = &lru->node[nid].lru; if (irq) spin_lock_irq(&l->lock); else spin_lock(&l->lock); return l; } static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) { if (irq_off) spin_unlock_irq(&l->lock); else spin_unlock(&l->lock); } #endif /* CONFIG_MEMCG */ /* The caller must ensure the memcg lifetime. */ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; l = lock_list_lru_of_memcg(lru, nid, memcg, false, false); if (!l) return false; if (list_empty(item)) { list_add_tail(item, &l->list); /* Set shrinker bit if the first element was added */ if (!l->nr_items++) set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); unlock_list_lru(l, false); atomic_long_inc(&nlru->nr_items); return true; } unlock_list_lru(l, false); return false; } bool list_lru_add_obj(struct list_lru *lru, struct list_head *item) { bool ret; int nid = page_to_nid(virt_to_page(item)); if (list_lru_memcg_aware(lru)) { rcu_read_lock(); ret = list_lru_add(lru, item, nid, mem_cgroup_from_slab_obj(item)); rcu_read_unlock(); } else { ret = list_lru_add(lru, item, nid, NULL); } return ret; } EXPORT_SYMBOL_GPL(list_lru_add_obj); /* The caller must ensure the memcg lifetime. */ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; l = lock_list_lru_of_memcg(lru, nid, memcg, false, false); if (!l) return false; if (!list_empty(item)) { list_del_init(item); l->nr_items--; unlock_list_lru(l, false); atomic_long_dec(&nlru->nr_items); return true; } unlock_list_lru(l, false); return false; } bool list_lru_del_obj(struct list_lru *lru, struct list_head *item) { bool ret; int nid = page_to_nid(virt_to_page(item)); if (list_lru_memcg_aware(lru)) { rcu_read_lock(); ret = list_lru_del(lru, item, nid, mem_cgroup_from_slab_obj(item)); rcu_read_unlock(); } else { ret = list_lru_del(lru, item, nid, NULL); } return ret; } EXPORT_SYMBOL_GPL(list_lru_del_obj); void list_lru_isolate(struct list_lru_one *list, struct list_head *item) { list_del_init(item); list->nr_items--; } EXPORT_SYMBOL_GPL(list_lru_isolate); void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, struct list_head *head) { list_move(item, head); list->nr_items--; } EXPORT_SYMBOL_GPL(list_lru_isolate_move); unsigned long list_lru_count_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg) { struct list_lru_one *l; long count; rcu_read_lock(); l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); count = l ? READ_ONCE(l->nr_items) : 0; rcu_read_unlock(); if (unlikely(count < 0)) count = 0; return count; } EXPORT_SYMBOL_GPL(list_lru_count_one); unsigned long list_lru_count_node(struct list_lru *lru, int nid) { struct list_lru_node *nlru; nlru = &lru->node[nid]; return atomic_long_read(&nlru->nr_items); } EXPORT_SYMBOL_GPL(list_lru_count_node); static unsigned long __list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk, bool irq_off) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l = NULL; struct list_head *item, *n; unsigned long isolated = 0; restart: l = lock_list_lru_of_memcg(lru, nid, memcg, irq_off, true); if (!l) return isolated; list_for_each_safe(item, n, &l->list) { enum lru_status ret; /* * decrement nr_to_walk first so that we don't livelock if we * get stuck on large numbers of LRU_RETRY items */ if (!*nr_to_walk) break; --*nr_to_walk; ret = isolate(item, l, cb_arg); switch (ret) { /* * LRU_RETRY, LRU_REMOVED_RETRY and LRU_STOP will drop the lru * lock. List traversal will have to restart from scratch. */ case LRU_RETRY: goto restart; case LRU_REMOVED_RETRY: fallthrough; case LRU_REMOVED: isolated++; atomic_long_dec(&nlru->nr_items); if (ret == LRU_REMOVED_RETRY) goto restart; break; case LRU_ROTATE: list_move_tail(item, &l->list); break; case LRU_SKIP: break; case LRU_STOP: goto out; default: BUG(); } } unlock_list_lru(l, irq_off); out: return isolated; } unsigned long list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { return __list_lru_walk_one(lru, nid, memcg, isolate, cb_arg, nr_to_walk, false); } EXPORT_SYMBOL_GPL(list_lru_walk_one); unsigned long list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { return __list_lru_walk_one(lru, nid, memcg, isolate, cb_arg, nr_to_walk, true); } unsigned long list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { long isolated = 0; isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg, nr_to_walk); #ifdef CONFIG_MEMCG if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { struct list_lru_memcg *mlru; struct mem_cgroup *memcg; unsigned long index; xa_for_each(&lru->xa, index, mlru) { rcu_read_lock(); memcg = mem_cgroup_from_id(index); if (!mem_cgroup_tryget(memcg)) { rcu_read_unlock(); continue; } rcu_read_unlock(); isolated += __list_lru_walk_one(lru, nid, memcg, isolate, cb_arg, nr_to_walk, false); mem_cgroup_put(memcg); if (*nr_to_walk <= 0) break; } } #endif return isolated; } EXPORT_SYMBOL_GPL(list_lru_walk_node); static void init_one_lru(struct list_lru *lru, struct list_lru_one *l) { INIT_LIST_HEAD(&l->list); spin_lock_init(&l->lock); l->nr_items = 0; #ifdef CONFIG_LOCKDEP if (lru->key) lockdep_set_class(&l->lock, lru->key); #endif } #ifdef CONFIG_MEMCG static struct list_lru_memcg *memcg_init_list_lru_one(struct list_lru *lru, gfp_t gfp) { int nid; struct list_lru_memcg *mlru; mlru = kmalloc(struct_size(mlru, node, nr_node_ids), gfp); if (!mlru) return NULL; for_each_node(nid) init_one_lru(lru, &mlru->node[nid]); return mlru; } static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { if (memcg_aware) xa_init_flags(&lru->xa, XA_FLAGS_LOCK_IRQ); lru->memcg_aware = memcg_aware; } static void memcg_destroy_list_lru(struct list_lru *lru) { XA_STATE(xas, &lru->xa, 0); struct list_lru_memcg *mlru; if (!list_lru_memcg_aware(lru)) return; xas_lock_irq(&xas); xas_for_each(&xas, mlru, ULONG_MAX) { kfree(mlru); xas_store(&xas, NULL); } xas_unlock_irq(&xas); } static void memcg_reparent_list_lru_one(struct list_lru *lru, int nid, struct list_lru_one *src, struct mem_cgroup *dst_memcg) { int dst_idx = dst_memcg->kmemcg_id; struct list_lru_one *dst; spin_lock_irq(&src->lock); dst = list_lru_from_memcg_idx(lru, nid, dst_idx); spin_lock_nested(&dst->lock, SINGLE_DEPTH_NESTING); list_splice_init(&src->list, &dst->list); if (src->nr_items) { WARN_ON(src->nr_items < 0); dst->nr_items += src->nr_items; set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); } /* Mark the list_lru_one dead */ src->nr_items = LONG_MIN; spin_unlock(&dst->lock); spin_unlock_irq(&src->lock); } void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent) { struct list_lru *lru; int i; mutex_lock(&list_lrus_mutex); list_for_each_entry(lru, &memcg_list_lrus, list) { struct list_lru_memcg *mlru; XA_STATE(xas, &lru->xa, memcg->kmemcg_id); /* * Lock the Xarray to ensure no on going list_lru_memcg * allocation and further allocation will see css_is_dying(). */ xas_lock_irq(&xas); mlru = xas_store(&xas, NULL); xas_unlock_irq(&xas); if (!mlru) continue; /* * With Xarray value set to NULL, holding the lru lock below * prevents list_lru_{add,del,isolate} from touching the lru, * safe to reparent. */ for_each_node(i) memcg_reparent_list_lru_one(lru, i, &mlru->node[i], parent); /* * Here all list_lrus corresponding to the cgroup are guaranteed * to remain empty, we can safely free this lru, any further * memcg_list_lru_alloc() call will simply bail out. */ kvfree_rcu(mlru, rcu); } mutex_unlock(&list_lrus_mutex); } static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg, struct list_lru *lru) { int idx = memcg->kmemcg_id; return idx < 0 || xa_load(&lru->xa, idx); } int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, gfp_t gfp) { unsigned long flags; struct list_lru_memcg *mlru = NULL; struct mem_cgroup *pos, *parent; XA_STATE(xas, &lru->xa, 0); if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru)) return 0; gfp &= GFP_RECLAIM_MASK; /* * Because the list_lru can be reparented to the parent cgroup's * list_lru, we should make sure that this cgroup and all its * ancestors have allocated list_lru_memcg. */ do { /* * Keep finding the farest parent that wasn't populated * until found memcg itself. */ pos = memcg; parent = parent_mem_cgroup(pos); while (!memcg_list_lru_allocated(parent, lru)) { pos = parent; parent = parent_mem_cgroup(pos); } if (!mlru) { mlru = memcg_init_list_lru_one(lru, gfp); if (!mlru) return -ENOMEM; } xas_set(&xas, pos->kmemcg_id); do { xas_lock_irqsave(&xas, flags); if (!xas_load(&xas) && !css_is_dying(&pos->css)) { xas_store(&xas, mlru); if (!xas_error(&xas)) mlru = NULL; } xas_unlock_irqrestore(&xas, flags); } while (xas_nomem(&xas, gfp)); } while (pos != memcg && !css_is_dying(&pos->css)); if (unlikely(mlru)) kfree(mlru); return xas_error(&xas); } #else static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { } static void memcg_destroy_list_lru(struct list_lru *lru) { } #endif /* CONFIG_MEMCG */ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct shrinker *shrinker) { int i; #ifdef CONFIG_MEMCG if (shrinker) lru->shrinker_id = shrinker->id; else lru->shrinker_id = -1; if (mem_cgroup_kmem_disabled()) memcg_aware = false; #endif lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL); if (!lru->node) return -ENOMEM; for_each_node(i) init_one_lru(lru, &lru->node[i].lru); memcg_init_list_lru(lru, memcg_aware); list_lru_register(lru); return 0; } EXPORT_SYMBOL_GPL(__list_lru_init); void list_lru_destroy(struct list_lru *lru) { /* Already destroyed or not yet initialized? */ if (!lru->node) return; list_lru_unregister(lru); memcg_destroy_list_lru(lru); kfree(lru->node); lru->node = NULL; #ifdef CONFIG_MEMCG lru->shrinker_id = -1; #endif } EXPORT_SYMBOL_GPL(list_lru_destroy);
607 605 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 // SPDX-License-Identifier: GPL-2.0 /* * Wakeup statistics in sysfs * * Copyright (c) 2019 Linux Foundation * Copyright (c) 2019 Greg Kroah-Hartman <gregkh@linuxfoundation.org> * Copyright (c) 2019 Google Inc. */ #include <linux/device.h> #include <linux/idr.h> #include <linux/init.h> #include <linux/kdev_t.h> #include <linux/kernel.h> #include <linux/kobject.h> #include <linux/slab.h> #include <linux/timekeeping.h> #include "power.h" static struct class *wakeup_class; #define wakeup_attr(_name) \ static ssize_t _name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct wakeup_source *ws = dev_get_drvdata(dev); \ \ return sysfs_emit(buf, "%lu\n", ws->_name); \ } \ static DEVICE_ATTR_RO(_name) wakeup_attr(active_count); wakeup_attr(event_count); wakeup_attr(wakeup_count); wakeup_attr(expire_count); wakeup_attr(relax_count); static ssize_t active_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t active_time = ws->active ? ktime_sub(ktime_get(), ws->last_time) : 0; return sysfs_emit(buf, "%lld\n", ktime_to_ms(active_time)); } static DEVICE_ATTR_RO(active_time_ms); static ssize_t total_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t active_time; ktime_t total_time = ws->total_time; if (ws->active) { active_time = ktime_sub(ktime_get(), ws->last_time); total_time = ktime_add(total_time, active_time); } return sysfs_emit(buf, "%lld\n", ktime_to_ms(total_time)); } static DEVICE_ATTR_RO(total_time_ms); static ssize_t max_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t active_time; ktime_t max_time = ws->max_time; if (ws->active) { active_time = ktime_sub(ktime_get(), ws->last_time); if (active_time > max_time) max_time = active_time; } return sysfs_emit(buf, "%lld\n", ktime_to_ms(max_time)); } static DEVICE_ATTR_RO(max_time_ms); static ssize_t last_change_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); return sysfs_emit(buf, "%lld\n", ktime_to_ms(ws->last_time)); } static DEVICE_ATTR_RO(last_change_ms); static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); return sysfs_emit(buf, "%s\n", ws->name); } static DEVICE_ATTR_RO(name); static ssize_t prevent_suspend_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t prevent_sleep_time = ws->prevent_sleep_time; if (ws->active && ws->autosleep_enabled) { prevent_sleep_time = ktime_add(prevent_sleep_time, ktime_sub(ktime_get(), ws->start_prevent_time)); } return sysfs_emit(buf, "%lld\n", ktime_to_ms(prevent_sleep_time)); } static DEVICE_ATTR_RO(prevent_suspend_time_ms); static struct attribute *wakeup_source_attrs[] = { &dev_attr_name.attr, &dev_attr_active_count.attr, &dev_attr_event_count.attr, &dev_attr_wakeup_count.attr, &dev_attr_expire_count.attr, &dev_attr_relax_count.attr, &dev_attr_active_time_ms.attr, &dev_attr_total_time_ms.attr, &dev_attr_max_time_ms.attr, &dev_attr_last_change_ms.attr, &dev_attr_prevent_suspend_time_ms.attr, NULL, }; ATTRIBUTE_GROUPS(wakeup_source); static void device_create_release(struct device *dev) { kfree(dev); } static struct device *wakeup_source_device_create(struct device *parent, struct wakeup_source *ws) { struct device *dev = NULL; int retval; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) { retval = -ENOMEM; goto error; } device_initialize(dev); dev->devt = MKDEV(0, 0); dev->class = wakeup_class; dev->parent = parent; dev->groups = wakeup_source_groups; dev->release = device_create_release; dev_set_drvdata(dev, ws); device_set_pm_not_required(dev); retval = dev_set_name(dev, "wakeup%d", ws->id); if (retval) goto error; retval = device_add(dev); if (retval) goto error; return dev; error: put_device(dev); return ERR_PTR(retval); } /** * wakeup_source_sysfs_add - Add wakeup_source attributes to sysfs. * @parent: Device given wakeup source is associated with (or NULL if virtual). * @ws: Wakeup source to be added in sysfs. */ int wakeup_source_sysfs_add(struct device *parent, struct wakeup_source *ws) { struct device *dev; dev = wakeup_source_device_create(parent, ws); if (IS_ERR(dev)) return PTR_ERR(dev); ws->dev = dev; return 0; } /** * pm_wakeup_source_sysfs_add - Add wakeup_source attributes to sysfs * for a device if they're missing. * @parent: Device given wakeup source is associated with */ int pm_wakeup_source_sysfs_add(struct device *parent) { if (!parent->power.wakeup || parent->power.wakeup->dev) return 0; return wakeup_source_sysfs_add(parent, parent->power.wakeup); } /** * wakeup_source_sysfs_remove - Remove wakeup_source attributes from sysfs. * @ws: Wakeup source to be removed from sysfs. */ void wakeup_source_sysfs_remove(struct wakeup_source *ws) { device_unregister(ws->dev); } static int __init wakeup_sources_sysfs_init(void) { wakeup_class = class_create("wakeup"); return PTR_ERR_OR_ZERO(wakeup_class); } postcore_initcall(wakeup_sources_sysfs_init);
2 6 3 3 22 22 19 2 22 5 10 7 3 10 10 4 6 1 3 6 9 4 9 5 3 7 3 3 3 20 20 17 3 20 1 1 10 2 1 11 11 4 7 7 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:net,port type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_getport.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 /* 1 SCTP and UDPLITE support added */ /* 2 Range as input support for IPv4 added */ /* 3 nomatch flag support added */ /* 4 Counters support added */ /* 5 Comments support added */ /* 6 Forceadd support added */ /* 7 skbinfo support added */ #define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:net,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net,port"); /* Type specific function prefix */ #define HTYPE hash_netport #define IP_SET_HASH_WITH_PROTO #define IP_SET_HASH_WITH_NETS /* We squeeze the "nomatch" flag into cidr: we don't support cidr == 0 * However this way we have to store internally cidr - 1, * dancing back and forth. */ #define IP_SET_HASH_WITH_NETS_PACKED /* IPv4 variant */ /* Member elements */ struct hash_netport4_elem { __be32 ip; __be16 port; u8 proto; u8 cidr:7; u8 nomatch:1; }; /* Common functions */ static bool hash_netport4_data_equal(const struct hash_netport4_elem *ip1, const struct hash_netport4_elem *ip2, u32 *multi) { return ip1->ip == ip2->ip && ip1->port == ip2->port && ip1->proto == ip2->proto && ip1->cidr == ip2->cidr; } static int hash_netport4_do_data_match(const struct hash_netport4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } static void hash_netport4_data_set_flags(struct hash_netport4_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static void hash_netport4_data_reset_flags(struct hash_netport4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } static void hash_netport4_data_netmask(struct hash_netport4_elem *elem, u8 cidr) { elem->ip &= ip_set_netmask(cidr); elem->cidr = cidr - 1; } static bool hash_netport4_data_list(struct sk_buff *skb, const struct hash_netport4_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr + 1) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_netport4_data_next(struct hash_netport4_elem *next, const struct hash_netport4_elem *d) { next->ip = d->ip; next->port = d->port; } #define MTYPE hash_netport4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" static int hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_netport4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netport4_elem e = { .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (adt == IPSET_TEST) e.cidr = HOST_MASK - 1; if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); e.ip &= ip_set_netmask(e.cidr + 1); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct hash_netport4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to, p = 0, ip = 0, ip_to = 0, i = 0; bool with_ports = false; u8 cidr; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; e.cidr = cidr - 1; } e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; with_ports = with_ports && tb[IPSET_ATTR_PORT_TO]; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } if (adt == IPSET_TEST || !(with_ports || tb[IPSET_ATTR_IP_TO])) { e.ip = htonl(ip & ip_set_hostmask(e.cidr + 1)); ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } port = port_to = ntohs(e.port); if (tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port_to < port) swap(port, port_to); } if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; if (ip_to < ip) swap(ip, ip_to); if (ip + UINT_MAX == ip_to) return -IPSET_ERR_HASH_RANGE; } else { ip_set_mask_from_to(ip, ip_to, e.cidr + 1); } if (retried) { ip = ntohl(h->next.ip); p = ntohs(h->next.port); } else { p = port; } do { e.ip = htonl(ip); ip = ip_set_range_to_cidr(ip, ip_to, &cidr); e.cidr = cidr - 1; for (; p <= port_to; p++, i++) { e.port = htons(p); if (i > IPSET_MAX_RANGE) { hash_netport4_data_next(&h->next, &e); return -ERANGE; } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } p = port; } while (ip++ < ip_to); return ret; } /* IPv6 variant */ struct hash_netport6_elem { union nf_inet_addr ip; __be16 port; u8 proto; u8 cidr:7; u8 nomatch:1; }; /* Common functions */ static bool hash_netport6_data_equal(const struct hash_netport6_elem *ip1, const struct hash_netport6_elem *ip2, u32 *multi) { return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ip1->port == ip2->port && ip1->proto == ip2->proto && ip1->cidr == ip2->cidr; } static int hash_netport6_do_data_match(const struct hash_netport6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } static void hash_netport6_data_set_flags(struct hash_netport6_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static void hash_netport6_data_reset_flags(struct hash_netport6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } static void hash_netport6_data_netmask(struct hash_netport6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip, cidr); elem->cidr = cidr - 1; } static bool hash_netport6_data_list(struct sk_buff *skb, const struct hash_netport6_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr + 1) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_netport6_data_next(struct hash_netport6_elem *next, const struct hash_netport6_elem *d) { next->port = d->port; } #undef MTYPE #undef HOST_MASK #define MTYPE hash_netport6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_netport6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netport6_elem e = { .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (adt == IPSET_TEST) e.cidr = HOST_MASK - 1; if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); ip6_netmask(&e.ip, e.cidr + 1); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_netport6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netport6_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; bool with_ports = false; u8 cidr; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; e.cidr = cidr - 1; } ip6_netmask(&e.ip, e.cidr + 1); e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); if (retried) port = ntohs(h->next.port); for (; port <= port_to; port++) { e.port = htons(port); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } return ret; } static struct ip_set_type hash_netport_type __read_mostly = { .name = "hash:net,port", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_NOMATCH, .dimension = IPSET_DIM_TWO, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_netport_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_PORT] = { .type = NLA_U16 }, [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_netport_init(void) { return ip_set_type_register(&hash_netport_type); } static void __exit hash_netport_fini(void) { rcu_barrier(); ip_set_type_unregister(&hash_netport_type); } module_init(hash_netport_init); module_exit(hash_netport_fini);
759 605 605 216 216 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 // SPDX-License-Identifier: GPL-2.0 /* * Software nodes for the firmware node framework. * * Copyright (C) 2018, Intel Corporation * Author: Heikki Krogerus <heikki.krogerus@linux.intel.com> */ #include <linux/container_of.h> #include <linux/device.h> #include <linux/err.h> #include <linux/export.h> #include <linux/idr.h> #include <linux/init.h> #include <linux/kobject.h> #include <linux/kstrtox.h> #include <linux/list.h> #include <linux/property.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/sysfs.h> #include <linux/types.h> #include "base.h" struct swnode { struct kobject kobj; struct fwnode_handle fwnode; const struct software_node *node; int id; /* hierarchy */ struct ida child_ids; struct list_head entry; struct list_head children; struct swnode *parent; unsigned int allocated:1; unsigned int managed:1; }; static DEFINE_IDA(swnode_root_ids); static struct kset *swnode_kset; #define kobj_to_swnode(_kobj_) container_of(_kobj_, struct swnode, kobj) static const struct fwnode_operations software_node_ops; bool is_software_node(const struct fwnode_handle *fwnode) { return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &software_node_ops; } EXPORT_SYMBOL_GPL(is_software_node); #define to_swnode(__fwnode) \ ({ \ typeof(__fwnode) __to_swnode_fwnode = __fwnode; \ \ is_software_node(__to_swnode_fwnode) ? \ container_of(__to_swnode_fwnode, \ struct swnode, fwnode) : NULL; \ }) static inline struct swnode *dev_to_swnode(struct device *dev) { struct fwnode_handle *fwnode = dev_fwnode(dev); if (!fwnode) return NULL; if (!is_software_node(fwnode)) fwnode = fwnode->secondary; return to_swnode(fwnode); } static struct swnode * software_node_to_swnode(const struct software_node *node) { struct swnode *swnode = NULL; struct kobject *k; if (!node) return NULL; spin_lock(&swnode_kset->list_lock); list_for_each_entry(k, &swnode_kset->list, entry) { swnode = kobj_to_swnode(k); if (swnode->node == node) break; swnode = NULL; } spin_unlock(&swnode_kset->list_lock); return swnode; } const struct software_node *to_software_node(const struct fwnode_handle *fwnode) { const struct swnode *swnode = to_swnode(fwnode); return swnode ? swnode->node : NULL; } EXPORT_SYMBOL_GPL(to_software_node); struct fwnode_handle *software_node_fwnode(const struct software_node *node) { struct swnode *swnode = software_node_to_swnode(node); return swnode ? &swnode->fwnode : NULL; } EXPORT_SYMBOL_GPL(software_node_fwnode); /* -------------------------------------------------------------------------- */ /* property_entry processing */ static const struct property_entry * property_entry_get(const struct property_entry *prop, const char *name) { if (!prop) return NULL; for (; prop->name; prop++) if (!strcmp(name, prop->name)) return prop; return NULL; } static const void *property_get_pointer(const struct property_entry *prop) { if (!prop->length) return NULL; return prop->is_inline ? &prop->value : prop->pointer; } static const void *property_entry_find(const struct property_entry *props, const char *propname, size_t length) { const struct property_entry *prop; const void *pointer; prop = property_entry_get(props, propname); if (!prop) return ERR_PTR(-EINVAL); pointer = property_get_pointer(prop); if (!pointer) return ERR_PTR(-ENODATA); if (length > prop->length) return ERR_PTR(-EOVERFLOW); return pointer; } static int property_entry_count_elems_of_size(const struct property_entry *props, const char *propname, size_t length) { const struct property_entry *prop; prop = property_entry_get(props, propname); if (!prop) return -EINVAL; return prop->length / length; } static int property_entry_read_int_array(const struct property_entry *props, const char *name, unsigned int elem_size, void *val, size_t nval) { const void *pointer; size_t length; if (!val) return property_entry_count_elems_of_size(props, name, elem_size); if (!is_power_of_2(elem_size) || elem_size > sizeof(u64)) return -ENXIO; length = nval * elem_size; pointer = property_entry_find(props, name, length); if (IS_ERR(pointer)) return PTR_ERR(pointer); memcpy(val, pointer, length); return 0; } static int property_entry_read_string_array(const struct property_entry *props, const char *propname, const char **strings, size_t nval) { const void *pointer; size_t length; int array_len; /* Find out the array length. */ array_len = property_entry_count_elems_of_size(props, propname, sizeof(const char *)); if (array_len < 0) return array_len; /* Return how many there are if strings is NULL. */ if (!strings) return array_len; array_len = min_t(size_t, nval, array_len); length = array_len * sizeof(*strings); pointer = property_entry_find(props, propname, length); if (IS_ERR(pointer)) return PTR_ERR(pointer); memcpy(strings, pointer, length); return array_len; } static void property_entry_free_data(const struct property_entry *p) { const char * const *src_str; size_t i, nval; if (p->type == DEV_PROP_STRING) { src_str = property_get_pointer(p); nval = p->length / sizeof(*src_str); for (i = 0; i < nval; i++) kfree(src_str[i]); } if (!p->is_inline) kfree(p->pointer); kfree(p->name); } static bool property_copy_string_array(const char **dst_ptr, const char * const *src_ptr, size_t nval) { int i; for (i = 0; i < nval; i++) { dst_ptr[i] = kstrdup(src_ptr[i], GFP_KERNEL); if (!dst_ptr[i] && src_ptr[i]) { while (--i >= 0) kfree(dst_ptr[i]); return false; } } return true; } static int property_entry_copy_data(struct property_entry *dst, const struct property_entry *src) { const void *pointer = property_get_pointer(src); void *dst_ptr; size_t nval; /* * Properties with no data should not be marked as stored * out of line. */ if (!src->is_inline && !src->length) return -ENODATA; /* * Reference properties are never stored inline as * they are too big. */ if (src->type == DEV_PROP_REF && src->is_inline) return -EINVAL; if (src->length <= sizeof(dst->value)) { dst_ptr = &dst->value; dst->is_inline = true; } else { dst_ptr = kmalloc(src->length, GFP_KERNEL); if (!dst_ptr) return -ENOMEM; dst->pointer = dst_ptr; } if (src->type == DEV_PROP_STRING) { nval = src->length / sizeof(const char *); if (!property_copy_string_array(dst_ptr, pointer, nval)) { if (!dst->is_inline) kfree(dst->pointer); return -ENOMEM; } } else { memcpy(dst_ptr, pointer, src->length); } dst->length = src->length; dst->type = src->type; dst->name = kstrdup(src->name, GFP_KERNEL); if (!dst->name) { property_entry_free_data(dst); return -ENOMEM; } return 0; } /** * property_entries_dup - duplicate array of properties * @properties: array of properties to copy * * This function creates a deep copy of the given NULL-terminated array * of property entries. */ struct property_entry * property_entries_dup(const struct property_entry *properties) { struct property_entry *p; int i, n = 0; int ret; if (!properties) return NULL; while (properties[n].name) n++; p = kcalloc(n + 1, sizeof(*p), GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); for (i = 0; i < n; i++) { ret = property_entry_copy_data(&p[i], &properties[i]); if (ret) { while (--i >= 0) property_entry_free_data(&p[i]); kfree(p); return ERR_PTR(ret); } } return p; } EXPORT_SYMBOL_GPL(property_entries_dup); /** * property_entries_free - free previously allocated array of properties * @properties: array of properties to destroy * * This function frees given NULL-terminated array of property entries, * along with their data. */ void property_entries_free(const struct property_entry *properties) { const struct property_entry *p; if (!properties) return; for (p = properties; p->name; p++) property_entry_free_data(p); kfree(properties); } EXPORT_SYMBOL_GPL(property_entries_free); /* -------------------------------------------------------------------------- */ /* fwnode operations */ static struct fwnode_handle *software_node_get(struct fwnode_handle *fwnode) { struct swnode *swnode = to_swnode(fwnode); kobject_get(&swnode->kobj); return &swnode->fwnode; } static void software_node_put(struct fwnode_handle *fwnode) { struct swnode *swnode = to_swnode(fwnode); kobject_put(&swnode->kobj); } static bool software_node_property_present(const struct fwnode_handle *fwnode, const char *propname) { struct swnode *swnode = to_swnode(fwnode); return !!property_entry_get(swnode->node->properties, propname); } static int software_node_read_int_array(const struct fwnode_handle *fwnode, const char *propname, unsigned int elem_size, void *val, size_t nval) { struct swnode *swnode = to_swnode(fwnode); return property_entry_read_int_array(swnode->node->properties, propname, elem_size, val, nval); } static int software_node_read_string_array(const struct fwnode_handle *fwnode, const char *propname, const char **val, size_t nval) { struct swnode *swnode = to_swnode(fwnode); return property_entry_read_string_array(swnode->node->properties, propname, val, nval); } static const char * software_node_get_name(const struct fwnode_handle *fwnode) { const struct swnode *swnode = to_swnode(fwnode); return kobject_name(&swnode->kobj); } static const char * software_node_get_name_prefix(const struct fwnode_handle *fwnode) { struct fwnode_handle *parent; const char *prefix; parent = fwnode_get_parent(fwnode); if (!parent) return ""; /* Figure out the prefix from the parents. */ while (is_software_node(parent)) parent = fwnode_get_next_parent(parent); prefix = fwnode_get_name_prefix(parent); fwnode_handle_put(parent); /* Guess something if prefix was NULL. */ return prefix ?: "/"; } static struct fwnode_handle * software_node_get_parent(const struct fwnode_handle *fwnode) { struct swnode *swnode = to_swnode(fwnode); if (!swnode || !swnode->parent) return NULL; return fwnode_handle_get(&swnode->parent->fwnode); } static struct fwnode_handle * software_node_get_next_child(const struct fwnode_handle *fwnode, struct fwnode_handle *child) { struct swnode *p = to_swnode(fwnode); struct swnode *c = to_swnode(child); if (!p || list_empty(&p->children) || (c && list_is_last(&c->entry, &p->children))) { fwnode_handle_put(child); return NULL; } if (c) c = list_next_entry(c, entry); else c = list_first_entry(&p->children, struct swnode, entry); fwnode_handle_put(child); return fwnode_handle_get(&c->fwnode); } static struct fwnode_handle * software_node_get_named_child_node(const struct fwnode_handle *fwnode, const char *childname) { struct swnode *swnode = to_swnode(fwnode); struct swnode *child; if (!swnode || list_empty(&swnode->children)) return NULL; list_for_each_entry(child, &swnode->children, entry) { if (!strcmp(childname, kobject_name(&child->kobj))) { kobject_get(&child->kobj); return &child->fwnode; } } return NULL; } static int software_node_get_reference_args(const struct fwnode_handle *fwnode, const char *propname, const char *nargs_prop, unsigned int nargs, unsigned int index, struct fwnode_reference_args *args) { struct swnode *swnode = to_swnode(fwnode); const struct software_node_ref_args *ref_array; const struct software_node_ref_args *ref; const struct property_entry *prop; struct fwnode_handle *refnode; u32 nargs_prop_val; int error; int i; prop = property_entry_get(swnode->node->properties, propname); if (!prop) return -ENOENT; if (prop->type != DEV_PROP_REF) return -EINVAL; /* * We expect that references are never stored inline, even * single ones, as they are too big. */ if (prop->is_inline) return -EINVAL; if ((index + 1) * sizeof(*ref) > prop->length) return -ENOENT; ref_array = prop->pointer; ref = &ref_array[index]; refnode = software_node_fwnode(ref->node); if (!refnode) return -ENOENT; if (nargs_prop) { error = property_entry_read_int_array(ref->node->properties, nargs_prop, sizeof(u32), &nargs_prop_val, 1); if (error) return error; nargs = nargs_prop_val; } if (nargs > NR_FWNODE_REFERENCE_ARGS) return -EINVAL; if (!args) return 0; args->fwnode = software_node_get(refnode); args->nargs = nargs; for (i = 0; i < nargs; i++) args->args[i] = ref->args[i]; return 0; } static struct fwnode_handle * swnode_graph_find_next_port(const struct fwnode_handle *parent, struct fwnode_handle *port) { struct fwnode_handle *old = port; while ((port = software_node_get_next_child(parent, old))) { /* * fwnode ports have naming style "port@", so we search for any * children that follow that convention. */ if (!strncmp(to_swnode(port)->node->name, "port@", strlen("port@"))) return port; old = port; } return NULL; } static struct fwnode_handle * software_node_graph_get_next_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle *endpoint) { struct swnode *swnode = to_swnode(fwnode); struct fwnode_handle *parent; struct fwnode_handle *port; if (!swnode) return NULL; if (endpoint) { port = software_node_get_parent(endpoint); parent = software_node_get_parent(port); } else { parent = software_node_get_named_child_node(fwnode, "ports"); if (!parent) parent = software_node_get(&swnode->fwnode); port = swnode_graph_find_next_port(parent, NULL); } for (; port; port = swnode_graph_find_next_port(parent, port)) { endpoint = software_node_get_next_child(port, endpoint); if (endpoint) { fwnode_handle_put(port); break; } } fwnode_handle_put(parent); return endpoint; } static struct fwnode_handle * software_node_graph_get_remote_endpoint(const struct fwnode_handle *fwnode) { struct swnode *swnode = to_swnode(fwnode); const struct software_node_ref_args *ref; const struct property_entry *prop; if (!swnode) return NULL; prop = property_entry_get(swnode->node->properties, "remote-endpoint"); if (!prop || prop->type != DEV_PROP_REF || prop->is_inline) return NULL; ref = prop->pointer; return software_node_get(software_node_fwnode(ref[0].node)); } static struct fwnode_handle * software_node_graph_get_port_parent(struct fwnode_handle *fwnode) { struct swnode *swnode = to_swnode(fwnode); swnode = swnode->parent; if (swnode && !strcmp(swnode->node->name, "ports")) swnode = swnode->parent; return swnode ? software_node_get(&swnode->fwnode) : NULL; } static int software_node_graph_parse_endpoint(const struct fwnode_handle *fwnode, struct fwnode_endpoint *endpoint) { struct swnode *swnode = to_swnode(fwnode); const char *parent_name = swnode->parent->node->name; int ret; if (strlen("port@") >= strlen(parent_name) || strncmp(parent_name, "port@", strlen("port@"))) return -EINVAL; /* Ports have naming style "port@n", we need to select the n */ ret = kstrtou32(parent_name + strlen("port@"), 10, &endpoint->port); if (ret) return ret; endpoint->id = swnode->id; endpoint->local_fwnode = fwnode; return 0; } static const struct fwnode_operations software_node_ops = { .get = software_node_get, .put = software_node_put, .property_present = software_node_property_present, .property_read_bool = software_node_property_present, .property_read_int_array = software_node_read_int_array, .property_read_string_array = software_node_read_string_array, .get_name = software_node_get_name, .get_name_prefix = software_node_get_name_prefix, .get_parent = software_node_get_parent, .get_next_child_node = software_node_get_next_child, .get_named_child_node = software_node_get_named_child_node, .get_reference_args = software_node_get_reference_args, .graph_get_next_endpoint = software_node_graph_get_next_endpoint, .graph_get_remote_endpoint = software_node_graph_get_remote_endpoint, .graph_get_port_parent = software_node_graph_get_port_parent, .graph_parse_endpoint = software_node_graph_parse_endpoint, }; /* -------------------------------------------------------------------------- */ /** * software_node_find_by_name - Find software node by name * @parent: Parent of the software node * @name: Name of the software node * * The function will find a node that is child of @parent and that is named * @name. If no node is found, the function returns NULL. * * NOTE: you will need to drop the reference with fwnode_handle_put() after use. */ const struct software_node * software_node_find_by_name(const struct software_node *parent, const char *name) { struct swnode *swnode = NULL; struct kobject *k; if (!name) return NULL; spin_lock(&swnode_kset->list_lock); list_for_each_entry(k, &swnode_kset->list, entry) { swnode = kobj_to_swnode(k); if (parent == swnode->node->parent && swnode->node->name && !strcmp(name, swnode->node->name)) { kobject_get(&swnode->kobj); break; } swnode = NULL; } spin_unlock(&swnode_kset->list_lock); return swnode ? swnode->node : NULL; } EXPORT_SYMBOL_GPL(software_node_find_by_name); static struct software_node *software_node_alloc(const struct property_entry *properties) { struct property_entry *props; struct software_node *node; props = property_entries_dup(properties); if (IS_ERR(props)) return ERR_CAST(props); node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) { property_entries_free(props); return ERR_PTR(-ENOMEM); } node->properties = props; return node; } static void software_node_free(const struct software_node *node) { property_entries_free(node->properties); kfree(node); } static void software_node_release(struct kobject *kobj) { struct swnode *swnode = kobj_to_swnode(kobj); if (swnode->parent) { ida_free(&swnode->parent->child_ids, swnode->id); list_del(&swnode->entry); } else { ida_free(&swnode_root_ids, swnode->id); } if (swnode->allocated) software_node_free(swnode->node); ida_destroy(&swnode->child_ids); kfree(swnode); } static const struct kobj_type software_node_type = { .release = software_node_release, .sysfs_ops = &kobj_sysfs_ops, }; static struct fwnode_handle * swnode_register(const struct software_node *node, struct swnode *parent, unsigned int allocated) { struct swnode *swnode; int ret; swnode = kzalloc(sizeof(*swnode), GFP_KERNEL); if (!swnode) return ERR_PTR(-ENOMEM); ret = ida_alloc(parent ? &parent->child_ids : &swnode_root_ids, GFP_KERNEL); if (ret < 0) { kfree(swnode); return ERR_PTR(ret); } swnode->id = ret; swnode->node = node; swnode->parent = parent; swnode->kobj.kset = swnode_kset; fwnode_init(&swnode->fwnode, &software_node_ops); ida_init(&swnode->child_ids); INIT_LIST_HEAD(&swnode->entry); INIT_LIST_HEAD(&swnode->children); if (node->name) ret = kobject_init_and_add(&swnode->kobj, &software_node_type, parent ? &parent->kobj : NULL, "%s", node->name); else ret = kobject_init_and_add(&swnode->kobj, &software_node_type, parent ? &parent->kobj : NULL, "node%d", swnode->id); if (ret) { kobject_put(&swnode->kobj); return ERR_PTR(ret); } /* * Assign the flag only in the successful case, so * the above kobject_put() won't mess up with properties. */ swnode->allocated = allocated; if (parent) list_add_tail(&swnode->entry, &parent->children); kobject_uevent(&swnode->kobj, KOBJ_ADD); return &swnode->fwnode; } /** * software_node_register_node_group - Register a group of software nodes * @node_group: NULL terminated array of software node pointers to be registered * * Register multiple software nodes at once. If any node in the array * has its .parent pointer set (which can only be to another software_node), * then its parent **must** have been registered before it is; either outside * of this function or by ordering the array such that parent comes before * child. */ int software_node_register_node_group(const struct software_node * const *node_group) { unsigned int i; int ret; if (!node_group) return 0; for (i = 0; node_group[i]; i++) { ret = software_node_register(node_group[i]); if (ret) { software_node_unregister_node_group(node_group); return ret; } } return 0; } EXPORT_SYMBOL_GPL(software_node_register_node_group); /** * software_node_unregister_node_group - Unregister a group of software nodes * @node_group: NULL terminated array of software node pointers to be unregistered * * Unregister multiple software nodes at once. If parent pointers are set up * in any of the software nodes then the array **must** be ordered such that * parents come before their children. * * NOTE: If you are uncertain whether the array is ordered such that * parents will be unregistered before their children, it is wiser to * remove the nodes individually, in the correct order (child before * parent). */ void software_node_unregister_node_group(const struct software_node * const *node_group) { unsigned int i = 0; if (!node_group) return; while (node_group[i]) i++; while (i--) software_node_unregister(node_group[i]); } EXPORT_SYMBOL_GPL(software_node_unregister_node_group); /** * software_node_register - Register static software node * @node: The software node to be registered */ int software_node_register(const struct software_node *node) { struct swnode *parent = software_node_to_swnode(node->parent); if (software_node_to_swnode(node)) return -EEXIST; if (node->parent && !parent) return -EINVAL; return PTR_ERR_OR_ZERO(swnode_register(node, parent, 0)); } EXPORT_SYMBOL_GPL(software_node_register); /** * software_node_unregister - Unregister static software node * @node: The software node to be unregistered */ void software_node_unregister(const struct software_node *node) { struct swnode *swnode; swnode = software_node_to_swnode(node); if (swnode) fwnode_remove_software_node(&swnode->fwnode); } EXPORT_SYMBOL_GPL(software_node_unregister); struct fwnode_handle * fwnode_create_software_node(const struct property_entry *properties, const struct fwnode_handle *parent) { struct fwnode_handle *fwnode; struct software_node *node; struct swnode *p; if (IS_ERR(parent)) return ERR_CAST(parent); p = to_swnode(parent); if (parent && !p) return ERR_PTR(-EINVAL); node = software_node_alloc(properties); if (IS_ERR(node)) return ERR_CAST(node); node->parent = p ? p->node : NULL; fwnode = swnode_register(node, p, 1); if (IS_ERR(fwnode)) software_node_free(node); return fwnode; } EXPORT_SYMBOL_GPL(fwnode_create_software_node); void fwnode_remove_software_node(struct fwnode_handle *fwnode) { struct swnode *swnode = to_swnode(fwnode); if (!swnode) return; kobject_put(&swnode->kobj); } EXPORT_SYMBOL_GPL(fwnode_remove_software_node); /** * device_add_software_node - Assign software node to a device * @dev: The device the software node is meant for. * @node: The software node. * * This function will make @node the secondary firmware node pointer of @dev. If * @dev has no primary node, then @node will become the primary node. The * function will register @node automatically if it wasn't already registered. */ int device_add_software_node(struct device *dev, const struct software_node *node) { struct swnode *swnode; int ret; /* Only one software node per device. */ if (dev_to_swnode(dev)) return -EBUSY; swnode = software_node_to_swnode(node); if (swnode) { kobject_get(&swnode->kobj); } else { ret = software_node_register(node); if (ret) return ret; swnode = software_node_to_swnode(node); } set_secondary_fwnode(dev, &swnode->fwnode); /* * If the device has been fully registered by the time this function is * called, software_node_notify() must be called separately so that the * symlinks get created and the reference count of the node is kept in * balance. */ if (device_is_registered(dev)) software_node_notify(dev); return 0; } EXPORT_SYMBOL_GPL(device_add_software_node); /** * device_remove_software_node - Remove device's software node * @dev: The device with the software node. * * This function will unregister the software node of @dev. */ void device_remove_software_node(struct device *dev) { struct swnode *swnode; swnode = dev_to_swnode(dev); if (!swnode) return; if (device_is_registered(dev)) software_node_notify_remove(dev); set_secondary_fwnode(dev, NULL); kobject_put(&swnode->kobj); } EXPORT_SYMBOL_GPL(device_remove_software_node); /** * device_create_managed_software_node - Create a software node for a device * @dev: The device the software node is assigned to. * @properties: Device properties for the software node. * @parent: Parent of the software node. * * Creates a software node as a managed resource for @dev, which means the * lifetime of the newly created software node is tied to the lifetime of @dev. * Software nodes created with this function should not be reused or shared * because of that. The function takes a deep copy of @properties for the * software node. * * Since the new software node is assigned directly to @dev, and since it should * not be shared, it is not returned to the caller. The function returns 0 on * success, and errno in case of an error. */ int device_create_managed_software_node(struct device *dev, const struct property_entry *properties, const struct software_node *parent) { struct fwnode_handle *p = software_node_fwnode(parent); struct fwnode_handle *fwnode; if (parent && !p) return -EINVAL; fwnode = fwnode_create_software_node(properties, p); if (IS_ERR(fwnode)) return PTR_ERR(fwnode); to_swnode(fwnode)->managed = true; set_secondary_fwnode(dev, fwnode); if (device_is_registered(dev)) software_node_notify(dev); return 0; } EXPORT_SYMBOL_GPL(device_create_managed_software_node); void software_node_notify(struct device *dev) { struct swnode *swnode; int ret; swnode = dev_to_swnode(dev); if (!swnode) return; kobject_get(&swnode->kobj); ret = sysfs_create_link(&dev->kobj, &swnode->kobj, "software_node"); if (ret) return; ret = sysfs_create_link(&swnode->kobj, &dev->kobj, dev_name(dev)); if (ret) { sysfs_remove_link(&dev->kobj, "software_node"); return; } } void software_node_notify_remove(struct device *dev) { struct swnode *swnode; swnode = dev_to_swnode(dev); if (!swnode) return; sysfs_remove_link(&swnode->kobj, dev_name(dev)); sysfs_remove_link(&dev->kobj, "software_node"); kobject_put(&swnode->kobj); if (swnode->managed) { set_secondary_fwnode(dev, NULL); kobject_put(&swnode->kobj); } } static int __init software_node_init(void) { swnode_kset = kset_create_and_add("software_nodes", NULL, kernel_kobj); if (!swnode_kset) return -ENOMEM; return 0; } postcore_initcall(software_node_init); static void __exit software_node_exit(void) { ida_destroy(&swnode_root_ids); kset_unregister(swnode_kset); } __exitcall(software_node_exit);
3 259 12 12 12 3 3 27 7 7 7 3 3 3 3 3 3 3 3 3 34 34 20 4 12 30 41 41 2 1 4 1 34 35 1 1 13 13 13 13 11 11 11 11 11 2 2 2 2 39 9 4 28 28 1 27 6 21 7 7 7 6 6 10 1 9 5 5 9 9 7 7 7 7 11 2 10 9 2 1 2 12 202 203 203 202 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 /* * Copyright (C) 2017-2018 Netronome Systems, Inc. * * This software is licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this * source tree. * * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. */ #include <linux/bpf.h> #include <linux/bpf_verifier.h> #include <linux/bug.h> #include <linux/kdev_t.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/printk.h> #include <linux/proc_ns.h> #include <linux/rhashtable.h> #include <linux/rtnetlink.h> #include <linux/rwsem.h> #include <net/netdev_lock.h> #include <net/xdp.h> /* Protects offdevs, members of bpf_offload_netdev and offload members * of all progs. * RTNL lock cannot be taken when holding this lock. */ static DECLARE_RWSEM(bpf_devs_lock); struct bpf_offload_dev { const struct bpf_prog_offload_ops *ops; struct list_head netdevs; void *priv; }; struct bpf_offload_netdev { struct rhash_head l; struct net_device *netdev; struct bpf_offload_dev *offdev; /* NULL when bound-only */ struct list_head progs; struct list_head maps; struct list_head offdev_netdevs; }; static const struct rhashtable_params offdevs_params = { .nelem_hint = 4, .key_len = sizeof(struct net_device *), .key_offset = offsetof(struct bpf_offload_netdev, netdev), .head_offset = offsetof(struct bpf_offload_netdev, l), .automatic_shrinking = true, }; static struct rhashtable offdevs; static int bpf_dev_offload_check(struct net_device *netdev) { if (!netdev) return -EINVAL; if (!netdev->netdev_ops->ndo_bpf) return -EOPNOTSUPP; return 0; } static struct bpf_offload_netdev * bpf_offload_find_netdev(struct net_device *netdev) { lockdep_assert_held(&bpf_devs_lock); return rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); } static int __bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, struct net_device *netdev) { struct bpf_offload_netdev *ondev; int err; ondev = kzalloc(sizeof(*ondev), GFP_KERNEL); if (!ondev) return -ENOMEM; ondev->netdev = netdev; ondev->offdev = offdev; INIT_LIST_HEAD(&ondev->progs); INIT_LIST_HEAD(&ondev->maps); err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params); if (err) { netdev_warn(netdev, "failed to register for BPF offload\n"); goto err_free; } if (offdev) list_add(&ondev->offdev_netdevs, &offdev->netdevs); return 0; err_free: kfree(ondev); return err; } static void __bpf_prog_offload_destroy(struct bpf_prog *prog) { struct bpf_prog_offload *offload = prog->aux->offload; if (offload->dev_state) offload->offdev->ops->destroy(prog); list_del_init(&offload->offloads); kfree(offload); prog->aux->offload = NULL; } static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap, enum bpf_netdev_command cmd) { struct netdev_bpf data = {}; struct net_device *netdev; ASSERT_RTNL(); data.command = cmd; data.offmap = offmap; /* Caller must make sure netdev is valid */ netdev = offmap->netdev; return netdev->netdev_ops->ndo_bpf(netdev, &data); } static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap) { WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE)); /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */ bpf_map_free_id(&offmap->map); list_del_init(&offmap->offloads); offmap->netdev = NULL; } static void __bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, struct net_device *netdev) { struct bpf_offload_netdev *ondev, *altdev = NULL; struct bpf_offloaded_map *offmap, *mtmp; struct bpf_prog_offload *offload, *ptmp; ASSERT_RTNL(); ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); if (WARN_ON(!ondev)) return; WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params)); /* Try to move the objects to another netdev of the device */ if (offdev) { list_del(&ondev->offdev_netdevs); altdev = list_first_entry_or_null(&offdev->netdevs, struct bpf_offload_netdev, offdev_netdevs); } if (altdev) { list_for_each_entry(offload, &ondev->progs, offloads) offload->netdev = altdev->netdev; list_splice_init(&ondev->progs, &altdev->progs); list_for_each_entry(offmap, &ondev->maps, offloads) offmap->netdev = altdev->netdev; list_splice_init(&ondev->maps, &altdev->maps); } else { list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads) __bpf_prog_offload_destroy(offload->prog); list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads) __bpf_map_offload_destroy(offmap); } WARN_ON(!list_empty(&ondev->progs)); WARN_ON(!list_empty(&ondev->maps)); kfree(ondev); } static int __bpf_prog_dev_bound_init(struct bpf_prog *prog, struct net_device *netdev) { struct bpf_offload_netdev *ondev; struct bpf_prog_offload *offload; int err; offload = kzalloc(sizeof(*offload), GFP_USER); if (!offload) return -ENOMEM; offload->prog = prog; offload->netdev = netdev; ondev = bpf_offload_find_netdev(offload->netdev); /* When program is offloaded require presence of "true" * bpf_offload_netdev, avoid the one created for !ondev case below. */ if (bpf_prog_is_offloaded(prog->aux) && (!ondev || !ondev->offdev)) { err = -EINVAL; goto err_free; } if (!ondev) { /* When only binding to the device, explicitly * create an entry in the hashtable. */ err = __bpf_offload_dev_netdev_register(NULL, offload->netdev); if (err) goto err_free; ondev = bpf_offload_find_netdev(offload->netdev); } offload->offdev = ondev->offdev; prog->aux->offload = offload; list_add_tail(&offload->offloads, &ondev->progs); return 0; err_free: kfree(offload); return err; } int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr) { struct net_device *netdev; int err; if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && attr->prog_type != BPF_PROG_TYPE_XDP) return -EINVAL; if (attr->prog_flags & ~(BPF_F_XDP_DEV_BOUND_ONLY | BPF_F_XDP_HAS_FRAGS)) return -EINVAL; /* Frags are allowed only if program is dev-bound-only, but not * if it is requesting bpf offload. */ if (attr->prog_flags & BPF_F_XDP_HAS_FRAGS && !(attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY)) return -EINVAL; if (attr->prog_type == BPF_PROG_TYPE_SCHED_CLS && attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY) return -EINVAL; netdev = dev_get_by_index(current->nsproxy->net_ns, attr->prog_ifindex); if (!netdev) return -EINVAL; err = bpf_dev_offload_check(netdev); if (err) goto out; prog->aux->offload_requested = !(attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY); down_write(&bpf_devs_lock); err = __bpf_prog_dev_bound_init(prog, netdev); up_write(&bpf_devs_lock); out: dev_put(netdev); return err; } int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog, struct bpf_prog *old_prog) { int err; if (!bpf_prog_is_dev_bound(old_prog->aux)) return 0; if (bpf_prog_is_offloaded(old_prog->aux)) return -EINVAL; new_prog->aux->dev_bound = old_prog->aux->dev_bound; new_prog->aux->offload_requested = old_prog->aux->offload_requested; down_write(&bpf_devs_lock); if (!old_prog->aux->offload) { err = -EINVAL; goto out; } err = __bpf_prog_dev_bound_init(new_prog, old_prog->aux->offload->netdev); out: up_write(&bpf_devs_lock); return err; } int bpf_prog_offload_verifier_prep(struct bpf_prog *prog) { struct bpf_prog_offload *offload; int ret = -ENODEV; down_read(&bpf_devs_lock); offload = prog->aux->offload; if (offload) { ret = offload->offdev->ops->prepare(prog); offload->dev_state = !ret; } up_read(&bpf_devs_lock); return ret; } int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) { struct bpf_prog_offload *offload; int ret = -ENODEV; down_read(&bpf_devs_lock); offload = env->prog->aux->offload; if (offload) ret = offload->offdev->ops->insn_hook(env, insn_idx, prev_insn_idx); up_read(&bpf_devs_lock); return ret; } int bpf_prog_offload_finalize(struct bpf_verifier_env *env) { struct bpf_prog_offload *offload; int ret = -ENODEV; down_read(&bpf_devs_lock); offload = env->prog->aux->offload; if (offload) { if (offload->offdev->ops->finalize) ret = offload->offdev->ops->finalize(env); else ret = 0; } up_read(&bpf_devs_lock); return ret; } void bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off, struct bpf_insn *insn) { const struct bpf_prog_offload_ops *ops; struct bpf_prog_offload *offload; int ret = -EOPNOTSUPP; down_read(&bpf_devs_lock); offload = env->prog->aux->offload; if (offload) { ops = offload->offdev->ops; if (!offload->opt_failed && ops->replace_insn) ret = ops->replace_insn(env, off, insn); offload->opt_failed |= ret; } up_read(&bpf_devs_lock); } void bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) { struct bpf_prog_offload *offload; int ret = -EOPNOTSUPP; down_read(&bpf_devs_lock); offload = env->prog->aux->offload; if (offload) { if (!offload->opt_failed && offload->offdev->ops->remove_insns) ret = offload->offdev->ops->remove_insns(env, off, cnt); offload->opt_failed |= ret; } up_read(&bpf_devs_lock); } void bpf_prog_dev_bound_destroy(struct bpf_prog *prog) { struct bpf_offload_netdev *ondev; struct net_device *netdev; rtnl_lock(); down_write(&bpf_devs_lock); if (prog->aux->offload) { list_del_init(&prog->aux->offload->offloads); netdev = prog->aux->offload->netdev; __bpf_prog_offload_destroy(prog); ondev = bpf_offload_find_netdev(netdev); if (!ondev->offdev && list_empty(&ondev->progs)) __bpf_offload_dev_netdev_unregister(NULL, netdev); } up_write(&bpf_devs_lock); rtnl_unlock(); } static int bpf_prog_offload_translate(struct bpf_prog *prog) { struct bpf_prog_offload *offload; int ret = -ENODEV; down_read(&bpf_devs_lock); offload = prog->aux->offload; if (offload) ret = offload->offdev->ops->translate(prog); up_read(&bpf_devs_lock); return ret; } static unsigned int bpf_prog_warn_on_exec(const void *ctx, const struct bpf_insn *insn) { WARN(1, "attempt to execute device eBPF program on the host!"); return 0; } int bpf_prog_offload_compile(struct bpf_prog *prog) { prog->bpf_func = bpf_prog_warn_on_exec; return bpf_prog_offload_translate(prog); } struct ns_get_path_bpf_prog_args { struct bpf_prog *prog; struct bpf_prog_info *info; }; static struct ns_common *bpf_prog_offload_info_fill_ns(void *private_data) { struct ns_get_path_bpf_prog_args *args = private_data; struct bpf_prog_aux *aux = args->prog->aux; struct ns_common *ns; struct net *net; rtnl_lock(); down_read(&bpf_devs_lock); if (aux->offload) { args->info->ifindex = aux->offload->netdev->ifindex; net = dev_net(aux->offload->netdev); get_net(net); ns = &net->ns; } else { args->info->ifindex = 0; ns = NULL; } up_read(&bpf_devs_lock); rtnl_unlock(); return ns; } int bpf_prog_offload_info_fill(struct bpf_prog_info *info, struct bpf_prog *prog) { struct ns_get_path_bpf_prog_args args = { .prog = prog, .info = info, }; struct bpf_prog_aux *aux = prog->aux; struct inode *ns_inode; struct path ns_path; char __user *uinsns; int res; u32 ulen; res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args); if (res) { if (!info->ifindex) return -ENODEV; return res; } down_read(&bpf_devs_lock); if (!aux->offload) { up_read(&bpf_devs_lock); return -ENODEV; } ulen = info->jited_prog_len; info->jited_prog_len = aux->offload->jited_len; if (info->jited_prog_len && ulen) { uinsns = u64_to_user_ptr(info->jited_prog_insns); ulen = min_t(u32, info->jited_prog_len, ulen); if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) { up_read(&bpf_devs_lock); return -EFAULT; } } up_read(&bpf_devs_lock); ns_inode = ns_path.dentry->d_inode; info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); info->netns_ino = ns_inode->i_ino; path_put(&ns_path); return 0; } const struct bpf_prog_ops bpf_offload_prog_ops = { }; struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) { struct net *net = current->nsproxy->net_ns; struct bpf_offload_netdev *ondev; struct bpf_offloaded_map *offmap; int err; if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); if (attr->map_type != BPF_MAP_TYPE_ARRAY && attr->map_type != BPF_MAP_TYPE_HASH) return ERR_PTR(-EINVAL); offmap = bpf_map_area_alloc(sizeof(*offmap), NUMA_NO_NODE); if (!offmap) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&offmap->map, attr); rtnl_lock(); offmap->netdev = __dev_get_by_index(net, attr->map_ifindex); err = bpf_dev_offload_check(offmap->netdev); if (err) goto err_unlock_rtnl; netdev_lock_ops(offmap->netdev); down_write(&bpf_devs_lock); ondev = bpf_offload_find_netdev(offmap->netdev); if (!ondev) { err = -EINVAL; goto err_unlock; } err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC); if (err) goto err_unlock; list_add_tail(&offmap->offloads, &ondev->maps); up_write(&bpf_devs_lock); netdev_unlock_ops(offmap->netdev); rtnl_unlock(); return &offmap->map; err_unlock: up_write(&bpf_devs_lock); netdev_unlock_ops(offmap->netdev); err_unlock_rtnl: rtnl_unlock(); bpf_map_area_free(offmap); return ERR_PTR(err); } void bpf_map_offload_map_free(struct bpf_map *map) { struct bpf_offloaded_map *offmap = map_to_offmap(map); rtnl_lock(); down_write(&bpf_devs_lock); if (offmap->netdev) __bpf_map_offload_destroy(offmap); up_write(&bpf_devs_lock); rtnl_unlock(); bpf_map_area_free(offmap); } u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map) { /* The memory dynamically allocated in netdev dev_ops is not counted */ return sizeof(struct bpf_offloaded_map); } int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value) { struct bpf_offloaded_map *offmap = map_to_offmap(map); int ret = -ENODEV; down_read(&bpf_devs_lock); if (offmap->netdev) ret = offmap->dev_ops->map_lookup_elem(offmap, key, value); up_read(&bpf_devs_lock); return ret; } int bpf_map_offload_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { struct bpf_offloaded_map *offmap = map_to_offmap(map); int ret = -ENODEV; if (unlikely(flags > BPF_EXIST)) return -EINVAL; down_read(&bpf_devs_lock); if (offmap->netdev) ret = offmap->dev_ops->map_update_elem(offmap, key, value, flags); up_read(&bpf_devs_lock); return ret; } int bpf_map_offload_delete_elem(struct bpf_map *map, void *key) { struct bpf_offloaded_map *offmap = map_to_offmap(map); int ret = -ENODEV; down_read(&bpf_devs_lock); if (offmap->netdev) ret = offmap->dev_ops->map_delete_elem(offmap, key); up_read(&bpf_devs_lock); return ret; } int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_offloaded_map *offmap = map_to_offmap(map); int ret = -ENODEV; down_read(&bpf_devs_lock); if (offmap->netdev) ret = offmap->dev_ops->map_get_next_key(offmap, key, next_key); up_read(&bpf_devs_lock); return ret; } struct ns_get_path_bpf_map_args { struct bpf_offloaded_map *offmap; struct bpf_map_info *info; }; static struct ns_common *bpf_map_offload_info_fill_ns(void *private_data) { struct ns_get_path_bpf_map_args *args = private_data; struct ns_common *ns; struct net *net; rtnl_lock(); down_read(&bpf_devs_lock); if (args->offmap->netdev) { args->info->ifindex = args->offmap->netdev->ifindex; net = dev_net(args->offmap->netdev); get_net(net); ns = &net->ns; } else { args->info->ifindex = 0; ns = NULL; } up_read(&bpf_devs_lock); rtnl_unlock(); return ns; } int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map) { struct ns_get_path_bpf_map_args args = { .offmap = map_to_offmap(map), .info = info, }; struct inode *ns_inode; struct path ns_path; int res; res = ns_get_path_cb(&ns_path, bpf_map_offload_info_fill_ns, &args); if (res) { if (!info->ifindex) return -ENODEV; return res; } ns_inode = ns_path.dentry->d_inode; info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); info->netns_ino = ns_inode->i_ino; path_put(&ns_path); return 0; } static bool __bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev) { struct bpf_offload_netdev *ondev1, *ondev2; struct bpf_prog_offload *offload; if (!bpf_prog_is_dev_bound(prog->aux)) return false; offload = prog->aux->offload; if (!offload) return false; if (offload->netdev == netdev) return true; ondev1 = bpf_offload_find_netdev(offload->netdev); ondev2 = bpf_offload_find_netdev(netdev); return ondev1 && ondev2 && ondev1->offdev == ondev2->offdev; } bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev) { bool ret; down_read(&bpf_devs_lock); ret = __bpf_offload_dev_match(prog, netdev); up_read(&bpf_devs_lock); return ret; } EXPORT_SYMBOL_GPL(bpf_offload_dev_match); bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs) { bool ret; if (bpf_prog_is_offloaded(lhs->aux) != bpf_prog_is_offloaded(rhs->aux)) return false; down_read(&bpf_devs_lock); ret = lhs->aux->offload && rhs->aux->offload && lhs->aux->offload->netdev && lhs->aux->offload->netdev == rhs->aux->offload->netdev; up_read(&bpf_devs_lock); return ret; } bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) { struct bpf_offloaded_map *offmap; bool ret; if (!bpf_map_is_offloaded(map)) return bpf_map_offload_neutral(map); offmap = map_to_offmap(map); down_read(&bpf_devs_lock); ret = __bpf_offload_dev_match(prog, offmap->netdev); up_read(&bpf_devs_lock); return ret; } int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, struct net_device *netdev) { int err; down_write(&bpf_devs_lock); err = __bpf_offload_dev_netdev_register(offdev, netdev); up_write(&bpf_devs_lock); return err; } EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register); void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, struct net_device *netdev) { down_write(&bpf_devs_lock); __bpf_offload_dev_netdev_unregister(offdev, netdev); up_write(&bpf_devs_lock); } EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister); struct bpf_offload_dev * bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv) { struct bpf_offload_dev *offdev; offdev = kzalloc(sizeof(*offdev), GFP_KERNEL); if (!offdev) return ERR_PTR(-ENOMEM); offdev->ops = ops; offdev->priv = priv; INIT_LIST_HEAD(&offdev->netdevs); return offdev; } EXPORT_SYMBOL_GPL(bpf_offload_dev_create); void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev) { WARN_ON(!list_empty(&offdev->netdevs)); kfree(offdev); } EXPORT_SYMBOL_GPL(bpf_offload_dev_destroy); void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev) { return offdev->priv; } EXPORT_SYMBOL_GPL(bpf_offload_dev_priv); void bpf_dev_bound_netdev_unregister(struct net_device *dev) { struct bpf_offload_netdev *ondev; ASSERT_RTNL(); down_write(&bpf_devs_lock); ondev = bpf_offload_find_netdev(dev); if (ondev && !ondev->offdev) __bpf_offload_dev_netdev_unregister(NULL, ondev->netdev); up_write(&bpf_devs_lock); } int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log, struct bpf_prog_aux *prog_aux) { if (!bpf_prog_is_dev_bound(prog_aux)) { bpf_log(log, "metadata kfuncs require device-bound program\n"); return -EINVAL; } if (bpf_prog_is_offloaded(prog_aux)) { bpf_log(log, "metadata kfuncs can't be offloaded\n"); return -EINVAL; } return 0; } void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id) { const struct xdp_metadata_ops *ops; void *p = NULL; /* We don't hold bpf_devs_lock while resolving several * kfuncs and can race with the unregister_netdevice(). * We rely on bpf_dev_bound_match() check at attach * to render this program unusable. */ down_read(&bpf_devs_lock); if (!prog->aux->offload) goto out; ops = prog->aux->offload->netdev->xdp_metadata_ops; if (!ops) goto out; #define XDP_METADATA_KFUNC(name, _, __, xmo) \ if (func_id == bpf_xdp_metadata_kfunc_id(name)) p = ops->xmo; XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC out: up_read(&bpf_devs_lock); return p; } static int __init bpf_offload_init(void) { return rhashtable_init(&offdevs, &offdevs_params); } core_initcall(bpf_offload_init);
134 17 124 115 75 67 62 68 2 109 12 99 98 137 137 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 // SPDX-License-Identifier: GPL-2.0-or-later /* * Create default crypto algorithm instances. * * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/internal/aead.h> #include <linux/completion.h> #include <linux/ctype.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/notifier.h> #include <linux/rtnetlink.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/string.h> #include "internal.h" struct cryptomgr_param { struct rtattr *tb[CRYPTO_MAX_ATTRS + 2]; struct { struct rtattr attr; struct crypto_attr_type data; } type; struct { struct rtattr attr; struct crypto_attr_alg data; } attrs[CRYPTO_MAX_ATTRS]; char template[CRYPTO_MAX_ALG_NAME]; struct crypto_larval *larval; u32 otype; u32 omask; }; struct crypto_test_param { char driver[CRYPTO_MAX_ALG_NAME]; char alg[CRYPTO_MAX_ALG_NAME]; u32 type; }; static int cryptomgr_probe(void *data) { struct cryptomgr_param *param = data; struct crypto_template *tmpl; int err = -ENOENT; tmpl = crypto_lookup_template(param->template); if (!tmpl) goto out; do { err = tmpl->create(tmpl, param->tb); } while (err == -EAGAIN && !signal_pending(current)); crypto_tmpl_put(tmpl); out: param->larval->adult = ERR_PTR(err); param->larval->alg.cra_flags |= CRYPTO_ALG_DEAD; complete_all(&param->larval->completion); crypto_alg_put(&param->larval->alg); kfree(param); module_put_and_kthread_exit(0); } static int cryptomgr_schedule_probe(struct crypto_larval *larval) { struct task_struct *thread; struct cryptomgr_param *param; const char *name = larval->alg.cra_name; const char *p; unsigned int len; int i; if (!try_module_get(THIS_MODULE)) goto err; param = kzalloc(sizeof(*param), GFP_KERNEL); if (!param) goto err_put_module; for (p = name; isalnum(*p) || *p == '-' || *p == '_'; p++) ; len = p - name; if (!len || *p != '(') goto err_free_param; memcpy(param->template, name, len); i = 0; for (;;) { name = ++p; for (; isalnum(*p) || *p == '-' || *p == '_'; p++) ; if (*p == '(') { int recursion = 0; for (;;) { if (!*++p) goto err_free_param; if (*p == '(') recursion++; else if (*p == ')' && !recursion--) break; } p++; } len = p - name; if (!len) goto err_free_param; param->attrs[i].attr.rta_len = sizeof(param->attrs[i]); param->attrs[i].attr.rta_type = CRYPTOA_ALG; memcpy(param->attrs[i].data.name, name, len); param->tb[i + 1] = &param->attrs[i].attr; i++; if (i >= CRYPTO_MAX_ATTRS) goto err_free_param; if (*p == ')') break; if (*p != ',') goto err_free_param; } param->tb[i + 1] = NULL; param->type.attr.rta_len = sizeof(param->type); param->type.attr.rta_type = CRYPTOA_TYPE; param->type.data.type = larval->alg.cra_flags & ~CRYPTO_ALG_TESTED; param->type.data.mask = larval->mask & ~CRYPTO_ALG_TESTED; param->tb[0] = &param->type.attr; param->otype = larval->alg.cra_flags; param->omask = larval->mask; crypto_alg_get(&larval->alg); param->larval = larval; thread = kthread_run(cryptomgr_probe, param, "cryptomgr_probe"); if (IS_ERR(thread)) goto err_put_larval; return NOTIFY_STOP; err_put_larval: crypto_alg_put(&larval->alg); err_free_param: kfree(param); err_put_module: module_put(THIS_MODULE); err: return NOTIFY_OK; } static int cryptomgr_test(void *data) { struct crypto_test_param *param = data; u32 type = param->type; int err; err = alg_test(param->driver, param->alg, type, CRYPTO_ALG_TESTED); crypto_alg_tested(param->driver, err); kfree(param); module_put_and_kthread_exit(0); } static int cryptomgr_schedule_test(struct crypto_alg *alg) { struct task_struct *thread; struct crypto_test_param *param; if (!IS_ENABLED(CONFIG_CRYPTO_SELFTESTS)) return NOTIFY_DONE; if (!try_module_get(THIS_MODULE)) goto err; param = kzalloc(sizeof(*param), GFP_KERNEL); if (!param) goto err_put_module; memcpy(param->driver, alg->cra_driver_name, sizeof(param->driver)); memcpy(param->alg, alg->cra_name, sizeof(param->alg)); param->type = alg->cra_flags; thread = kthread_run(cryptomgr_test, param, "cryptomgr_test"); if (IS_ERR(thread)) goto err_free_param; return NOTIFY_STOP; err_free_param: kfree(param); err_put_module: module_put(THIS_MODULE); err: return NOTIFY_OK; } static int cryptomgr_notify(struct notifier_block *this, unsigned long msg, void *data) { switch (msg) { case CRYPTO_MSG_ALG_REQUEST: return cryptomgr_schedule_probe(data); case CRYPTO_MSG_ALG_REGISTER: return cryptomgr_schedule_test(data); case CRYPTO_MSG_ALG_LOADED: break; } return NOTIFY_DONE; } static struct notifier_block cryptomgr_notifier = { .notifier_call = cryptomgr_notify, }; static int __init cryptomgr_init(void) { return crypto_register_notifier(&cryptomgr_notifier); } static void __exit cryptomgr_exit(void) { int err = crypto_unregister_notifier(&cryptomgr_notifier); BUG_ON(err); } module_init(cryptomgr_init); module_exit(cryptomgr_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Crypto Algorithm Manager");
20 6 14 23 23 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 // SPDX-License-Identifier: GPL-2.0-or-later /* * CMAC: Cipher Block Mode for Authentication * * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> * * Based on work by: * Copyright © 2013 Tom St Denis <tstdenis@elliptictech.com> * Based on crypto/xcbc.c: * Copyright © 2006 USAGI/WIDE Project, * Author: Kazunori Miyazawa <miyazawa@linux-ipv6.org> */ #include <crypto/internal/cipher.h> #include <crypto/internal/hash.h> #include <crypto/utils.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/string.h> /* * +------------------------ * | <parent tfm> * +------------------------ * | cmac_tfm_ctx * +------------------------ * | consts (block size * 2) * +------------------------ */ struct cmac_tfm_ctx { struct crypto_cipher *child; __be64 consts[]; }; static int crypto_cmac_digest_setkey(struct crypto_shash *parent, const u8 *inkey, unsigned int keylen) { struct cmac_tfm_ctx *ctx = crypto_shash_ctx(parent); unsigned int bs = crypto_shash_blocksize(parent); __be64 *consts = ctx->consts; u64 _const[2]; int i, err = 0; u8 msb_mask, gfmask; err = crypto_cipher_setkey(ctx->child, inkey, keylen); if (err) return err; /* encrypt the zero block */ memset(consts, 0, bs); crypto_cipher_encrypt_one(ctx->child, (u8 *)consts, (u8 *)consts); switch (bs) { case 16: gfmask = 0x87; _const[0] = be64_to_cpu(consts[1]); _const[1] = be64_to_cpu(consts[0]); /* gf(2^128) multiply zero-ciphertext with u and u^2 */ for (i = 0; i < 4; i += 2) { msb_mask = ((s64)_const[1] >> 63) & gfmask; _const[1] = (_const[1] << 1) | (_const[0] >> 63); _const[0] = (_const[0] << 1) ^ msb_mask; consts[i + 0] = cpu_to_be64(_const[1]); consts[i + 1] = cpu_to_be64(_const[0]); } break; case 8: gfmask = 0x1B; _const[0] = be64_to_cpu(consts[0]); /* gf(2^64) multiply zero-ciphertext with u and u^2 */ for (i = 0; i < 2; i++) { msb_mask = ((s64)_const[0] >> 63) & gfmask; _const[0] = (_const[0] << 1) ^ msb_mask; consts[i] = cpu_to_be64(_const[0]); } break; } return 0; } static int crypto_cmac_digest_init(struct shash_desc *pdesc) { int bs = crypto_shash_blocksize(pdesc->tfm); u8 *prev = shash_desc_ctx(pdesc); memset(prev, 0, bs); return 0; } static int crypto_cmac_digest_update(struct shash_desc *pdesc, const u8 *p, unsigned int len) { struct crypto_shash *parent = pdesc->tfm; struct cmac_tfm_ctx *tctx = crypto_shash_ctx(parent); struct crypto_cipher *tfm = tctx->child; int bs = crypto_shash_blocksize(parent); u8 *prev = shash_desc_ctx(pdesc); do { crypto_xor(prev, p, bs); crypto_cipher_encrypt_one(tfm, prev, prev); p += bs; len -= bs; } while (len >= bs); return len; } static int crypto_cmac_digest_finup(struct shash_desc *pdesc, const u8 *src, unsigned int len, u8 *out) { struct crypto_shash *parent = pdesc->tfm; struct cmac_tfm_ctx *tctx = crypto_shash_ctx(parent); struct crypto_cipher *tfm = tctx->child; int bs = crypto_shash_blocksize(parent); u8 *prev = shash_desc_ctx(pdesc); unsigned int offset = 0; crypto_xor(prev, src, len); if (len != bs) { prev[len] ^= 0x80; offset += bs; } crypto_xor(prev, (const u8 *)tctx->consts + offset, bs); crypto_cipher_encrypt_one(tfm, out, prev); return 0; } static int cmac_init_tfm(struct crypto_shash *tfm) { struct shash_instance *inst = shash_alg_instance(tfm); struct cmac_tfm_ctx *ctx = crypto_shash_ctx(tfm); struct crypto_cipher_spawn *spawn; struct crypto_cipher *cipher; spawn = shash_instance_ctx(inst); cipher = crypto_spawn_cipher(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->child = cipher; return 0; } static int cmac_clone_tfm(struct crypto_shash *tfm, struct crypto_shash *otfm) { struct cmac_tfm_ctx *octx = crypto_shash_ctx(otfm); struct cmac_tfm_ctx *ctx = crypto_shash_ctx(tfm); struct crypto_cipher *cipher; cipher = crypto_clone_cipher(octx->child); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->child = cipher; return 0; } static void cmac_exit_tfm(struct crypto_shash *tfm) { struct cmac_tfm_ctx *ctx = crypto_shash_ctx(tfm); crypto_free_cipher(ctx->child); } static int cmac_create(struct crypto_template *tmpl, struct rtattr **tb) { struct shash_instance *inst; struct crypto_cipher_spawn *spawn; struct crypto_alg *alg; u32 mask; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SHASH, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return -ENOMEM; spawn = shash_instance_ctx(inst); err = crypto_grab_cipher(spawn, shash_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; alg = crypto_spawn_cipher_alg(spawn); switch (alg->cra_blocksize) { case 16: case 8: break; default: err = -EINVAL; goto err_free_inst; } err = crypto_inst_setname(shash_crypto_instance(inst), tmpl->name, alg); if (err) goto err_free_inst; inst->alg.base.cra_priority = alg->cra_priority; inst->alg.base.cra_blocksize = alg->cra_blocksize; inst->alg.base.cra_ctxsize = sizeof(struct cmac_tfm_ctx) + alg->cra_blocksize * 2; inst->alg.base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | CRYPTO_AHASH_ALG_FINAL_NONZERO; inst->alg.digestsize = alg->cra_blocksize; inst->alg.descsize = alg->cra_blocksize; inst->alg.init = crypto_cmac_digest_init; inst->alg.update = crypto_cmac_digest_update; inst->alg.finup = crypto_cmac_digest_finup; inst->alg.setkey = crypto_cmac_digest_setkey; inst->alg.init_tfm = cmac_init_tfm; inst->alg.clone_tfm = cmac_clone_tfm; inst->alg.exit_tfm = cmac_exit_tfm; inst->free = shash_free_singlespawn_instance; err = shash_register_instance(tmpl, inst); if (err) { err_free_inst: shash_free_singlespawn_instance(inst); } return err; } static struct crypto_template crypto_cmac_tmpl = { .name = "cmac", .create = cmac_create, .module = THIS_MODULE, }; static int __init crypto_cmac_module_init(void) { return crypto_register_template(&crypto_cmac_tmpl); } static void __exit crypto_cmac_module_exit(void) { crypto_unregister_template(&crypto_cmac_tmpl); } module_init(crypto_cmac_module_init); module_exit(crypto_cmac_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("CMAC keyed hash algorithm"); MODULE_ALIAS_CRYPTO("cmac"); MODULE_IMPORT_NS("CRYPTO_INTERNAL");
42 42 42 42 42 75 75 75 7 7 7 7 7 10 9 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710 10711 10712 10713 10714 10715 10716 10717 10718 10719 10720 10721 10722 10723 10724 10725 10726 10727 10728 10729 10730 10731 10732 10733 10734 10735 10736 10737 10738 10739 10740 10741 10742 10743 10744 10745 10746 10747 10748 10749 10750 10751 10752 10753 10754 10755 10756 10757 10758 10759 10760 10761 10762 10763 10764 10765 10766 10767 10768 10769 10770 10771 10772 10773 10774 10775 10776 10777 10778 10779 10780 10781 10782 10783 10784 10785 10786 10787 10788 10789 10790 10791 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 10802 10803 10804 10805 10806 10807 10808 10809 10810 10811 10812 10813 10814 10815 10816 10817 10818 10819 10820 10821 10822 10823 10824 10825 10826 10827 10828 10829 10830 10831 10832 10833 10834 10835 10836 10837 10838 10839 10840 10841 10842 10843 10844 10845 10846 10847 10848 10849 10850 10851 10852 10853 10854 10855 10856 10857 10858 10859 10860 10861 10862 10863 10864 10865 10866 10867 10868 10869 10870 10871 10872 10873 10874 10875 10876 10877 10878 10879 10880 10881 10882 10883 10884 10885 10886 10887 10888 10889 10890 10891 10892 10893 10894 10895 10896 10897 10898 10899 10900 10901 10902 10903 10904 10905 10906 10907 10908 10909 10910 10911 10912 10913 10914 10915 10916 10917 10918 10919 10920 10921 10922 10923 10924 10925 10926 10927 10928 10929 10930 10931 10932 10933 10934 10935 10936 10937 10938 10939 10940 10941 10942 10943 10944 10945 10946 10947 10948 10949 10950 10951 10952 10953 10954 10955 10956 10957 10958 10959 10960 10961 10962 10963 10964 10965 10966 10967 10968 10969 10970 10971 10972 10973 10974 10975 10976 10977 10978 10979 10980 10981 10982 10983 10984 10985 10986 10987 10988 10989 10990 10991 10992 10993 10994 10995 10996 10997 10998 10999 11000 11001 11002 11003 11004 11005 11006 11007 11008 11009 11010 11011 11012 11013 11014 11015 11016 11017 11018 11019 11020 11021 11022 11023 11024 11025 11026 11027 11028 11029 11030 11031 11032 11033 11034 11035 11036 11037 11038 11039 11040 11041 11042 11043 11044 11045 11046 11047 11048 11049 // SPDX-License-Identifier: GPL-2.0-only /* * BSS client mode implementation * Copyright 2003-2008, Jouni Malinen <j@w1.fi> * Copyright 2004, Instant802 Networks, Inc. * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH * Copyright (C) 2018 - 2025 Intel Corporation */ #include <linux/delay.h> #include <linux/fips.h> #include <linux/if_ether.h> #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/etherdevice.h> #include <linux/moduleparam.h> #include <linux/rtnetlink.h> #include <linux/crc32.h> #include <linux/slab.h> #include <linux/export.h> #include <net/mac80211.h> #include <linux/unaligned.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" #include "led.h" #include "fils_aead.h" #include <kunit/static_stub.h> #define IEEE80211_AUTH_TIMEOUT (HZ / 5) #define IEEE80211_AUTH_TIMEOUT_LONG (HZ / 2) #define IEEE80211_AUTH_TIMEOUT_SHORT (HZ / 10) #define IEEE80211_AUTH_TIMEOUT_SAE (HZ * 2) #define IEEE80211_AUTH_MAX_TRIES 3 #define IEEE80211_AUTH_WAIT_ASSOC (HZ * 5) #define IEEE80211_AUTH_WAIT_SAE_RETRY (HZ * 2) #define IEEE80211_ASSOC_TIMEOUT (HZ / 5) #define IEEE80211_ASSOC_TIMEOUT_LONG (HZ / 2) #define IEEE80211_ASSOC_TIMEOUT_SHORT (HZ / 10) #define IEEE80211_ASSOC_MAX_TRIES 3 #define IEEE80211_ADV_TTLM_SAFETY_BUFFER_MS msecs_to_jiffies(100) #define IEEE80211_ADV_TTLM_ST_UNDERFLOW 0xff00 #define IEEE80211_NEG_TTLM_REQ_TIMEOUT (HZ / 5) static int max_nullfunc_tries = 2; module_param(max_nullfunc_tries, int, 0644); MODULE_PARM_DESC(max_nullfunc_tries, "Maximum nullfunc tx tries before disconnecting (reason 4)."); static int max_probe_tries = 5; module_param(max_probe_tries, int, 0644); MODULE_PARM_DESC(max_probe_tries, "Maximum probe tries before disconnecting (reason 4)."); /* * Beacon loss timeout is calculated as N frames times the * advertised beacon interval. This may need to be somewhat * higher than what hardware might detect to account for * delays in the host processing frames. But since we also * probe on beacon miss before declaring the connection lost * default to what we want. */ static int beacon_loss_count = 7; module_param(beacon_loss_count, int, 0644); MODULE_PARM_DESC(beacon_loss_count, "Number of beacon intervals before we decide beacon was lost."); /* * Time the connection can be idle before we probe * it to see if we can still talk to the AP. */ #define IEEE80211_CONNECTION_IDLE_TIME (30 * HZ) /* * Time we wait for a probe response after sending * a probe request because of beacon loss or for * checking the connection still works. */ static int probe_wait_ms = 500; module_param(probe_wait_ms, int, 0644); MODULE_PARM_DESC(probe_wait_ms, "Maximum time(ms) to wait for probe response" " before disconnecting (reason 4)."); /* * How many Beacon frames need to have been used in average signal strength * before starting to indicate signal change events. */ #define IEEE80211_SIGNAL_AVE_MIN_COUNT 4 /* * We can have multiple work items (and connection probing) * scheduling this timer, but we need to take care to only * reschedule it when it should fire _earlier_ than it was * asked for before, or if it's not pending right now. This * function ensures that. Note that it then is required to * run this function for all timeouts after the first one * has happened -- the work that runs from this timer will * do that. */ static void run_again(struct ieee80211_sub_if_data *sdata, unsigned long timeout) { lockdep_assert_wiphy(sdata->local->hw.wiphy); if (!timer_pending(&sdata->u.mgd.timer) || time_before(timeout, sdata->u.mgd.timer.expires)) mod_timer(&sdata->u.mgd.timer, timeout); } void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata) { if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER) return; if (ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR)) return; mod_timer(&sdata->u.mgd.bcn_mon_timer, round_jiffies_up(jiffies + sdata->u.mgd.beacon_timeout)); } void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; if (unlikely(!ifmgd->associated)) return; if (ifmgd->probe_send_count) ifmgd->probe_send_count = 0; if (ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR)) return; mod_timer(&ifmgd->conn_mon_timer, round_jiffies_up(jiffies + IEEE80211_CONNECTION_IDLE_TIME)); } static int ecw2cw(int ecw) { return (1 << ecw) - 1; } static enum ieee80211_conn_mode ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, u32 vht_cap_info, const struct ieee802_11_elems *elems, bool ignore_ht_channel_mismatch, const struct ieee80211_conn_settings *conn, struct cfg80211_chan_def *chandef) { const struct ieee80211_ht_operation *ht_oper = elems->ht_operation; const struct ieee80211_vht_operation *vht_oper = elems->vht_operation; const struct ieee80211_he_operation *he_oper = elems->he_operation; const struct ieee80211_eht_operation *eht_oper = elems->eht_operation; struct ieee80211_supported_band *sband = sdata->local->hw.wiphy->bands[channel->band]; struct cfg80211_chan_def vht_chandef; bool no_vht = false; u32 ht_cfreq; if (ieee80211_hw_check(&sdata->local->hw, STRICT)) ignore_ht_channel_mismatch = false; *chandef = (struct cfg80211_chan_def) { .chan = channel, .width = NL80211_CHAN_WIDTH_20_NOHT, .center_freq1 = channel->center_freq, .freq1_offset = channel->freq_offset, }; /* get special S1G case out of the way */ if (sband->band == NL80211_BAND_S1GHZ) { if (!ieee80211_chandef_s1g_oper(sdata->local, elems->s1g_oper, chandef)) { /* Fallback to default 1MHz */ chandef->width = NL80211_CHAN_WIDTH_1; chandef->s1g_primary_2mhz = false; } return IEEE80211_CONN_MODE_S1G; } /* get special 6 GHz case out of the way */ if (sband->band == NL80211_BAND_6GHZ) { enum ieee80211_conn_mode mode = IEEE80211_CONN_MODE_EHT; /* this is an error */ if (conn->mode < IEEE80211_CONN_MODE_HE) return IEEE80211_CONN_MODE_LEGACY; if (!elems->he_6ghz_capa || !elems->he_cap) { sdata_info(sdata, "HE 6 GHz AP is missing HE/HE 6 GHz band capability\n"); return IEEE80211_CONN_MODE_LEGACY; } if (!eht_oper || !elems->eht_cap) { eht_oper = NULL; mode = IEEE80211_CONN_MODE_HE; } if (!ieee80211_chandef_he_6ghz_oper(sdata->local, he_oper, eht_oper, chandef)) { sdata_info(sdata, "bad HE/EHT 6 GHz operation\n"); return IEEE80211_CONN_MODE_LEGACY; } return mode; } /* now we have the progression HT, VHT, ... */ if (conn->mode < IEEE80211_CONN_MODE_HT) return IEEE80211_CONN_MODE_LEGACY; if (!ht_oper || !elems->ht_cap_elem) return IEEE80211_CONN_MODE_LEGACY; chandef->width = NL80211_CHAN_WIDTH_20; ht_cfreq = ieee80211_channel_to_frequency(ht_oper->primary_chan, channel->band); /* check that channel matches the right operating channel */ if (!ignore_ht_channel_mismatch && channel->center_freq != ht_cfreq) { /* * It's possible that some APs are confused here; * Netgear WNDR3700 sometimes reports 4 higher than * the actual channel in association responses, but * since we look at probe response/beacon data here * it should be OK. */ sdata_info(sdata, "Wrong control channel: center-freq: %d ht-cfreq: %d ht->primary_chan: %d band: %d - Disabling HT\n", channel->center_freq, ht_cfreq, ht_oper->primary_chan, channel->band); return IEEE80211_CONN_MODE_LEGACY; } ieee80211_chandef_ht_oper(ht_oper, chandef); if (conn->mode < IEEE80211_CONN_MODE_VHT) return IEEE80211_CONN_MODE_HT; vht_chandef = *chandef; /* * having he_cap/he_oper parsed out implies we're at * least operating as HE STA */ if (elems->he_cap && he_oper && he_oper->he_oper_params & cpu_to_le32(IEEE80211_HE_OPERATION_VHT_OPER_INFO)) { struct ieee80211_vht_operation he_oper_vht_cap; /* * Set only first 3 bytes (other 2 aren't used in * ieee80211_chandef_vht_oper() anyway) */ memcpy(&he_oper_vht_cap, he_oper->optional, 3); he_oper_vht_cap.basic_mcs_set = cpu_to_le16(0); if (!ieee80211_chandef_vht_oper(&sdata->local->hw, vht_cap_info, &he_oper_vht_cap, ht_oper, &vht_chandef)) { sdata_info(sdata, "HE AP VHT information is invalid, disabling HE\n"); /* this will cause us to re-parse as VHT STA */ return IEEE80211_CONN_MODE_VHT; } } else if (!vht_oper || !elems->vht_cap_elem) { if (sband->band == NL80211_BAND_5GHZ) { sdata_info(sdata, "VHT information is missing, disabling VHT\n"); return IEEE80211_CONN_MODE_HT; } no_vht = true; } else if (sband->band == NL80211_BAND_2GHZ) { no_vht = true; } else if (!ieee80211_chandef_vht_oper(&sdata->local->hw, vht_cap_info, vht_oper, ht_oper, &vht_chandef)) { sdata_info(sdata, "AP VHT information is invalid, disabling VHT\n"); return IEEE80211_CONN_MODE_HT; } if (!cfg80211_chandef_compatible(chandef, &vht_chandef)) { sdata_info(sdata, "AP VHT information doesn't match HT, disabling VHT\n"); return IEEE80211_CONN_MODE_HT; } *chandef = vht_chandef; /* stick to current max mode if we or the AP don't have HE */ if (conn->mode < IEEE80211_CONN_MODE_HE || !elems->he_operation || !elems->he_cap) { if (no_vht) return IEEE80211_CONN_MODE_HT; return IEEE80211_CONN_MODE_VHT; } /* stick to HE if we or the AP don't have EHT */ if (conn->mode < IEEE80211_CONN_MODE_EHT || !eht_oper || !elems->eht_cap) return IEEE80211_CONN_MODE_HE; /* * handle the case that the EHT operation indicates that it holds EHT * operation information (in case that the channel width differs from * the channel width reported in HT/VHT/HE). */ if (eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT) { struct cfg80211_chan_def eht_chandef = *chandef; ieee80211_chandef_eht_oper((const void *)eht_oper->optional, &eht_chandef); eht_chandef.punctured = ieee80211_eht_oper_dis_subchan_bitmap(eht_oper); if (!cfg80211_chandef_valid(&eht_chandef)) { sdata_info(sdata, "AP EHT information is invalid, disabling EHT\n"); return IEEE80211_CONN_MODE_HE; } if (!cfg80211_chandef_compatible(chandef, &eht_chandef)) { sdata_info(sdata, "AP EHT information doesn't match HT/VHT/HE, disabling EHT\n"); return IEEE80211_CONN_MODE_HE; } *chandef = eht_chandef; } return IEEE80211_CONN_MODE_EHT; } static bool ieee80211_verify_sta_ht_mcs_support(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct ieee80211_ht_operation *ht_op) { struct ieee80211_sta_ht_cap sta_ht_cap; int i; if (sband->band == NL80211_BAND_6GHZ) return true; if (!ht_op) return false; memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap)); ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); /* * P802.11REVme/D7.0 - 6.5.4.2.4 * ... * If the MLME of an HT STA receives an MLME-JOIN.request primitive * with the SelectedBSS parameter containing a Basic HT-MCS Set field * in the HT Operation parameter that contains any unsupported MCSs, * the MLME response in the resulting MLME-JOIN.confirm primitive shall * contain a ResultCode parameter that is not set to the value SUCCESS. * ... */ /* Simply check that all basic rates are in the STA RX mask */ for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) { if ((ht_op->basic_set[i] & sta_ht_cap.mcs.rx_mask[i]) != ht_op->basic_set[i]) return false; } return true; } static bool ieee80211_verify_sta_vht_mcs_support(struct ieee80211_sub_if_data *sdata, int link_id, struct ieee80211_supported_band *sband, const struct ieee80211_vht_operation *vht_op) { struct ieee80211_sta_vht_cap sta_vht_cap; u16 ap_min_req_set, sta_rx_mcs_map, sta_tx_mcs_map; int nss; if (sband->band != NL80211_BAND_5GHZ) return true; if (!vht_op) return false; memcpy(&sta_vht_cap, &sband->vht_cap, sizeof(sta_vht_cap)); ieee80211_apply_vhtcap_overrides(sdata, &sta_vht_cap); ap_min_req_set = le16_to_cpu(vht_op->basic_mcs_set); sta_rx_mcs_map = le16_to_cpu(sta_vht_cap.vht_mcs.rx_mcs_map); sta_tx_mcs_map = le16_to_cpu(sta_vht_cap.vht_mcs.tx_mcs_map); /* * Many APs are incorrectly advertising an all-zero value here, * which really means MCS 0-7 are required for 1-8 streams, but * they don't really mean it that way. * Some other APs are incorrectly advertising 3 spatial streams * with MCS 0-7 are required, but don't really mean it that way * and we'll connect only with HT, rather than even HE. * As a result, unfortunately the VHT basic MCS/NSS set cannot * be used at all, so check it only in strict mode. */ if (!ieee80211_hw_check(&sdata->local->hw, STRICT)) return true; /* * P802.11REVme/D7.0 - 6.5.4.2.4 * ... * If the MLME of a VHT STA receives an MLME-JOIN.request primitive * with a SelectedBSS parameter containing a Basic VHT-MCS And NSS Set * field in the VHT Operation parameter that contains any unsupported * <VHT-MCS, NSS> tuple, the MLME response in the resulting * MLME-JOIN.confirm primitive shall contain a ResultCode parameter * that is not set to the value SUCCESS. * ... */ for (nss = 8; nss > 0; nss--) { u8 ap_op_val = (ap_min_req_set >> (2 * (nss - 1))) & 3; u8 sta_rx_val; u8 sta_tx_val; if (ap_op_val == IEEE80211_HE_MCS_NOT_SUPPORTED) continue; sta_rx_val = (sta_rx_mcs_map >> (2 * (nss - 1))) & 3; sta_tx_val = (sta_tx_mcs_map >> (2 * (nss - 1))) & 3; if (sta_rx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || sta_tx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || sta_rx_val < ap_op_val || sta_tx_val < ap_op_val) { link_id_info(sdata, link_id, "Missing mandatory rates for %d Nss, rx %d, tx %d oper %d, disable VHT\n", nss, sta_rx_val, sta_tx_val, ap_op_val); return false; } } return true; } static bool ieee80211_verify_peer_he_mcs_support(struct ieee80211_sub_if_data *sdata, int link_id, const struct ieee80211_he_cap_elem *he_cap, const struct ieee80211_he_operation *he_op) { struct ieee80211_he_mcs_nss_supp *he_mcs_nss_supp; u16 mcs_80_map_tx, mcs_80_map_rx; u16 ap_min_req_set; int nss; if (!he_cap) return false; /* mcs_nss is right after he_cap info */ he_mcs_nss_supp = (void *)(he_cap + 1); mcs_80_map_tx = le16_to_cpu(he_mcs_nss_supp->tx_mcs_80); mcs_80_map_rx = le16_to_cpu(he_mcs_nss_supp->rx_mcs_80); /* P802.11-REVme/D0.3 * 27.1.1 Introduction to the HE PHY * ... * An HE STA shall support the following features: * ... * Single spatial stream HE-MCSs 0 to 7 (transmit and receive) in all * supported channel widths for HE SU PPDUs */ if ((mcs_80_map_tx & 0x3) == IEEE80211_HE_MCS_NOT_SUPPORTED || (mcs_80_map_rx & 0x3) == IEEE80211_HE_MCS_NOT_SUPPORTED) { link_id_info(sdata, link_id, "Missing mandatory rates for 1 Nss, rx 0x%x, tx 0x%x, disable HE\n", mcs_80_map_tx, mcs_80_map_rx); return false; } if (!he_op) return true; ap_min_req_set = le16_to_cpu(he_op->he_mcs_nss_set); /* * Apparently iPhone 13 (at least iOS version 15.3.1) sets this to all * zeroes, which is nonsense, and completely inconsistent with itself * (it doesn't have 8 streams). Accept the settings in this case anyway. */ if (!ieee80211_hw_check(&sdata->local->hw, STRICT) && !ap_min_req_set) return true; /* make sure the AP is consistent with itself * * P802.11-REVme/D0.3 * 26.17.1 Basic HE BSS operation * * A STA that is operating in an HE BSS shall be able to receive and * transmit at each of the <HE-MCS, NSS> tuple values indicated by the * Basic HE-MCS And NSS Set field of the HE Operation parameter of the * MLME-START.request primitive and shall be able to receive at each of * the <HE-MCS, NSS> tuple values indicated by the Supported HE-MCS and * NSS Set field in the HE Capabilities parameter of the MLMESTART.request * primitive */ for (nss = 8; nss > 0; nss--) { u8 ap_op_val = (ap_min_req_set >> (2 * (nss - 1))) & 3; u8 ap_rx_val; u8 ap_tx_val; if (ap_op_val == IEEE80211_HE_MCS_NOT_SUPPORTED) continue; ap_rx_val = (mcs_80_map_rx >> (2 * (nss - 1))) & 3; ap_tx_val = (mcs_80_map_tx >> (2 * (nss - 1))) & 3; if (ap_rx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || ap_tx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || ap_rx_val < ap_op_val || ap_tx_val < ap_op_val) { link_id_info(sdata, link_id, "Invalid rates for %d Nss, rx %d, tx %d oper %d, disable HE\n", nss, ap_rx_val, ap_tx_val, ap_op_val); return false; } } return true; } static bool ieee80211_verify_sta_he_mcs_support(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct ieee80211_he_operation *he_op) { const struct ieee80211_sta_he_cap *sta_he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); u16 ap_min_req_set; int i; if (!sta_he_cap || !he_op) return false; ap_min_req_set = le16_to_cpu(he_op->he_mcs_nss_set); /* * Apparently iPhone 13 (at least iOS version 15.3.1) sets this to all * zeroes, which is nonsense, and completely inconsistent with itself * (it doesn't have 8 streams). Accept the settings in this case anyway. */ if (!ieee80211_hw_check(&sdata->local->hw, STRICT) && !ap_min_req_set) return true; /* Need to go over for 80MHz, 160MHz and for 80+80 */ for (i = 0; i < 3; i++) { const struct ieee80211_he_mcs_nss_supp *sta_mcs_nss_supp = &sta_he_cap->he_mcs_nss_supp; u16 sta_mcs_map_rx = le16_to_cpu(((__le16 *)sta_mcs_nss_supp)[2 * i]); u16 sta_mcs_map_tx = le16_to_cpu(((__le16 *)sta_mcs_nss_supp)[2 * i + 1]); u8 nss; bool verified = true; /* * For each band there is a maximum of 8 spatial streams * possible. Each of the sta_mcs_map_* is a 16-bit struct built * of 2 bits per NSS (1-8), with the values defined in enum * ieee80211_he_mcs_support. Need to make sure STA TX and RX * capabilities aren't less than the AP's minimum requirements * for this HE BSS per SS. * It is enough to find one such band that meets the reqs. */ for (nss = 8; nss > 0; nss--) { u8 sta_rx_val = (sta_mcs_map_rx >> (2 * (nss - 1))) & 3; u8 sta_tx_val = (sta_mcs_map_tx >> (2 * (nss - 1))) & 3; u8 ap_val = (ap_min_req_set >> (2 * (nss - 1))) & 3; if (ap_val == IEEE80211_HE_MCS_NOT_SUPPORTED) continue; /* * Make sure the HE AP doesn't require MCSs that aren't * supported by the client as required by spec * * P802.11-REVme/D0.3 * 26.17.1 Basic HE BSS operation * * An HE STA shall not attempt to join * (MLME-JOIN.request primitive) * a BSS, unless it supports (i.e., is able to both transmit and * receive using) all of the <HE-MCS, NSS> tuples in the basic * HE-MCS and NSS set. */ if (sta_rx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || sta_tx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || (ap_val > sta_rx_val) || (ap_val > sta_tx_val)) { verified = false; break; } } if (verified) return true; } /* If here, STA doesn't meet AP's HE min requirements */ return false; } static u8 ieee80211_get_eht_cap_mcs_nss(const struct ieee80211_sta_he_cap *sta_he_cap, const struct ieee80211_sta_eht_cap *sta_eht_cap, unsigned int idx, int bw) { u8 he_phy_cap0 = sta_he_cap->he_cap_elem.phy_cap_info[0]; u8 eht_phy_cap0 = sta_eht_cap->eht_cap_elem.phy_cap_info[0]; /* handle us being a 20 MHz-only EHT STA - with four values * for MCS 0-7, 8-9, 10-11, 12-13. */ if (!(he_phy_cap0 & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK_ALL)) return sta_eht_cap->eht_mcs_nss_supp.only_20mhz.rx_tx_max_nss[idx]; /* the others have MCS 0-9 together, rather than separately from 0-7 */ if (idx > 0) idx--; switch (bw) { case 0: return sta_eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_max_nss[idx]; case 1: if (!(he_phy_cap0 & (IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G))) return 0xff; /* pass check */ return sta_eht_cap->eht_mcs_nss_supp.bw._160.rx_tx_max_nss[idx]; case 2: if (!(eht_phy_cap0 & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ)) return 0xff; /* pass check */ return sta_eht_cap->eht_mcs_nss_supp.bw._320.rx_tx_max_nss[idx]; } WARN_ON(1); return 0; } static bool ieee80211_verify_sta_eht_mcs_support(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct ieee80211_eht_operation *eht_op) { const struct ieee80211_sta_he_cap *sta_he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); const struct ieee80211_sta_eht_cap *sta_eht_cap = ieee80211_get_eht_iftype_cap_vif(sband, &sdata->vif); const struct ieee80211_eht_mcs_nss_supp_20mhz_only *req; unsigned int i; if (!sta_he_cap || !sta_eht_cap || !eht_op) return false; req = &eht_op->basic_mcs_nss; for (i = 0; i < ARRAY_SIZE(req->rx_tx_max_nss); i++) { u8 req_rx_nss, req_tx_nss; unsigned int bw; req_rx_nss = u8_get_bits(req->rx_tx_max_nss[i], IEEE80211_EHT_MCS_NSS_RX); req_tx_nss = u8_get_bits(req->rx_tx_max_nss[i], IEEE80211_EHT_MCS_NSS_TX); for (bw = 0; bw < 3; bw++) { u8 have, have_rx_nss, have_tx_nss; have = ieee80211_get_eht_cap_mcs_nss(sta_he_cap, sta_eht_cap, i, bw); have_rx_nss = u8_get_bits(have, IEEE80211_EHT_MCS_NSS_RX); have_tx_nss = u8_get_bits(have, IEEE80211_EHT_MCS_NSS_TX); if (req_rx_nss > have_rx_nss || req_tx_nss > have_tx_nss) return false; } } return true; } static void ieee80211_get_rates(struct ieee80211_supported_band *sband, const u8 *supp_rates, unsigned int supp_rates_len, const u8 *ext_supp_rates, unsigned int ext_supp_rates_len, u32 *rates, u32 *basic_rates, unsigned long *unknown_rates_selectors, bool *have_higher_than_11mbit, int *min_rate, int *min_rate_index) { int i, j; for (i = 0; i < supp_rates_len + ext_supp_rates_len; i++) { u8 supp_rate = i < supp_rates_len ? supp_rates[i] : ext_supp_rates[i - supp_rates_len]; int rate = supp_rate & 0x7f; bool is_basic = !!(supp_rate & 0x80); if ((rate * 5) > 110 && have_higher_than_11mbit) *have_higher_than_11mbit = true; /* * Skip membership selectors since they're not rates. * * Note: Even though the membership selector and the basic * rate flag share the same bit, they are not exactly * the same. */ if (is_basic && rate >= BSS_MEMBERSHIP_SELECTOR_MIN) { if (unknown_rates_selectors) set_bit(rate, unknown_rates_selectors); continue; } for (j = 0; j < sband->n_bitrates; j++) { struct ieee80211_rate *br; int brate; br = &sband->bitrates[j]; brate = DIV_ROUND_UP(br->bitrate, 5); if (brate == rate) { if (rates) *rates |= BIT(j); if (is_basic && basic_rates) *basic_rates |= BIT(j); if (min_rate && (rate * 5) < *min_rate) { *min_rate = rate * 5; if (min_rate_index) *min_rate_index = j; } break; } } /* Handle an unknown entry as if it is an unknown selector */ if (is_basic && unknown_rates_selectors && j == sband->n_bitrates) set_bit(rate, unknown_rates_selectors); } } static bool ieee80211_chandef_usable(struct ieee80211_sub_if_data *sdata, const struct cfg80211_chan_def *chandef, u32 prohibited_flags) { if (!cfg80211_chandef_usable(sdata->local->hw.wiphy, chandef, prohibited_flags)) return false; if (chandef->punctured && ieee80211_hw_check(&sdata->local->hw, DISALLOW_PUNCTURING)) return false; return true; } static int ieee80211_chandef_num_subchans(const struct cfg80211_chan_def *c) { if (c->width == NL80211_CHAN_WIDTH_80P80) return 4 + 4; return cfg80211_chandef_get_width(c) / 20; } static int ieee80211_chandef_num_widths(const struct cfg80211_chan_def *c) { switch (c->width) { case NL80211_CHAN_WIDTH_20: case NL80211_CHAN_WIDTH_20_NOHT: return 1; case NL80211_CHAN_WIDTH_40: return 2; case NL80211_CHAN_WIDTH_80P80: case NL80211_CHAN_WIDTH_80: return 3; case NL80211_CHAN_WIDTH_160: return 4; case NL80211_CHAN_WIDTH_320: return 5; default: WARN_ON(1); return 0; } } VISIBLE_IF_MAC80211_KUNIT int ieee80211_calc_chandef_subchan_offset(const struct cfg80211_chan_def *ap, u8 n_partial_subchans) { int n = ieee80211_chandef_num_subchans(ap); struct cfg80211_chan_def tmp = *ap; int offset = 0; /* * Given a chandef (in this context, it's the AP's) and a number * of subchannels that we want to look at ('n_partial_subchans'), * calculate the offset in number of subchannels between the full * and the subset with the desired width. */ /* same number of subchannels means no offset, obviously */ if (n == n_partial_subchans) return 0; /* don't WARN - misconfigured APs could cause this if their N > width */ if (n < n_partial_subchans) return 0; while (ieee80211_chandef_num_subchans(&tmp) > n_partial_subchans) { u32 prev = tmp.center_freq1; ieee80211_chandef_downgrade(&tmp, NULL); /* * if center_freq moved up, half the original channels * are gone now but were below, so increase offset */ if (prev < tmp.center_freq1) offset += ieee80211_chandef_num_subchans(&tmp); } /* * 80+80 with secondary 80 below primary - four subchannels for it * (we cannot downgrade *to* 80+80, so no need to consider 'tmp') */ if (ap->width == NL80211_CHAN_WIDTH_80P80 && ap->center_freq2 < ap->center_freq1) offset += 4; return offset; } EXPORT_SYMBOL_IF_MAC80211_KUNIT(ieee80211_calc_chandef_subchan_offset); VISIBLE_IF_MAC80211_KUNIT void ieee80211_rearrange_tpe_psd(struct ieee80211_parsed_tpe_psd *psd, const struct cfg80211_chan_def *ap, const struct cfg80211_chan_def *used) { u8 needed = ieee80211_chandef_num_subchans(used); u8 have = ieee80211_chandef_num_subchans(ap); u8 tmp[IEEE80211_TPE_PSD_ENTRIES_320MHZ]; u8 offset; if (!psd->valid) return; /* if N is zero, all defaults were used, no point in rearranging */ if (!psd->n) goto out; BUILD_BUG_ON(sizeof(tmp) != sizeof(psd->power)); /* * This assumes that 'N' is consistent with the HE channel, as * it should be (otherwise the AP is broken). * * In psd->power we have values in the order 0..N, 0..K, where * N+K should cover the entire channel per 'ap', but even if it * doesn't then we've pre-filled 'unlimited' as defaults. * * But this is all the wrong order, we want to have them in the * order of the 'used' channel. * * So for example, we could have a 320 MHz EHT AP, which has the * HE channel as 80 MHz (e.g. due to puncturing, which doesn't * seem to be considered for the TPE), as follows: * * EHT 320: | | | | | | | | | | | | | | | | | * HE 80: | | | | | * used 160: | | | | | | | | | * * N entries: |--|--|--|--| * K entries: |--|--|--|--|--|--|--|--| |--|--|--|--| * power idx: 4 5 6 7 8 9 10 11 0 1 2 3 12 13 14 15 * full chan: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 * used chan: 0 1 2 3 4 5 6 7 * * The idx in the power array ('power idx') is like this since it * comes directly from the element's N and K entries in their * element order, and those are this way for HE compatibility. * * Rearrange them as desired here, first by putting them into the * 'full chan' order, and then selecting the necessary subset for * the 'used chan'. */ /* first reorder according to AP channel */ offset = ieee80211_calc_chandef_subchan_offset(ap, psd->n); for (int i = 0; i < have; i++) { if (i < offset) tmp[i] = psd->power[i + psd->n]; else if (i < offset + psd->n) tmp[i] = psd->power[i - offset]; else tmp[i] = psd->power[i]; } /* * and then select the subset for the used channel * (set everything to defaults first in case a driver is confused) */ memset(psd->power, IEEE80211_TPE_PSD_NO_LIMIT, sizeof(psd->power)); offset = ieee80211_calc_chandef_subchan_offset(ap, needed); for (int i = 0; i < needed; i++) psd->power[i] = tmp[offset + i]; out: /* limit, but don't lie if there are defaults in the data */ if (needed < psd->count) psd->count = needed; } EXPORT_SYMBOL_IF_MAC80211_KUNIT(ieee80211_rearrange_tpe_psd); static void ieee80211_rearrange_tpe(struct ieee80211_parsed_tpe *tpe, const struct cfg80211_chan_def *ap, const struct cfg80211_chan_def *used) { /* ignore this completely for narrow/invalid channels */ if (!ieee80211_chandef_num_subchans(ap) || !ieee80211_chandef_num_subchans(used)) { ieee80211_clear_tpe(tpe); return; } for (int i = 0; i < 2; i++) { int needed_pwr_count; ieee80211_rearrange_tpe_psd(&tpe->psd_local[i], ap, used); ieee80211_rearrange_tpe_psd(&tpe->psd_reg_client[i], ap, used); /* limit this to the widths we actually need */ needed_pwr_count = ieee80211_chandef_num_widths(used); if (needed_pwr_count < tpe->max_local[i].count) tpe->max_local[i].count = needed_pwr_count; if (needed_pwr_count < tpe->max_reg_client[i].count) tpe->max_reg_client[i].count = needed_pwr_count; } } /* * The AP part of the channel request is used to distinguish settings * to the device used for wider bandwidth OFDMA. This is used in the * channel context code to assign two channel contexts even if they're * both for the same channel, if the AP bandwidths are incompatible. * If not EHT (or driver override) then ap.chan == NULL indicates that * there's no wider BW OFDMA used. */ static void ieee80211_set_chanreq_ap(struct ieee80211_sub_if_data *sdata, struct ieee80211_chan_req *chanreq, struct ieee80211_conn_settings *conn, struct cfg80211_chan_def *ap_chandef) { chanreq->ap.chan = NULL; if (conn->mode < IEEE80211_CONN_MODE_EHT) return; if (sdata->vif.driver_flags & IEEE80211_VIF_IGNORE_OFDMA_WIDER_BW) return; chanreq->ap = *ap_chandef; } VISIBLE_IF_MAC80211_KUNIT struct ieee802_11_elems * ieee80211_determine_chan_mode(struct ieee80211_sub_if_data *sdata, struct ieee80211_conn_settings *conn, struct cfg80211_bss *cbss, int link_id, struct ieee80211_chan_req *chanreq, struct cfg80211_chan_def *ap_chandef, unsigned long *userspace_selectors) { const struct cfg80211_bss_ies *ies = rcu_dereference(cbss->ies); struct ieee80211_bss *bss = (void *)cbss->priv; struct ieee80211_channel *channel = cbss->channel; struct ieee80211_elems_parse_params parse_params = { .link_id = -1, .from_ap = true, .start = ies->data, .len = ies->len, }; struct ieee802_11_elems *elems; struct ieee80211_supported_band *sband; enum ieee80211_conn_mode ap_mode; unsigned long unknown_rates_selectors[BITS_TO_LONGS(128)] = {}; unsigned long sta_selectors[BITS_TO_LONGS(128)] = {}; int ret; again: parse_params.mode = conn->mode; elems = ieee802_11_parse_elems_full(&parse_params); if (!elems) return ERR_PTR(-ENOMEM); ap_mode = ieee80211_determine_ap_chan(sdata, channel, bss->vht_cap_info, elems, false, conn, ap_chandef); /* this should be impossible since parsing depends on our mode */ if (WARN_ON(ap_mode > conn->mode)) { ret = -EINVAL; goto free; } if (conn->mode != ap_mode) { conn->mode = ap_mode; kfree(elems); goto again; } mlme_link_id_dbg(sdata, link_id, "determined AP %pM to be %s\n", cbss->bssid, ieee80211_conn_mode_str(ap_mode)); sband = sdata->local->hw.wiphy->bands[channel->band]; ieee80211_get_rates(sband, elems->supp_rates, elems->supp_rates_len, elems->ext_supp_rates, elems->ext_supp_rates_len, NULL, NULL, unknown_rates_selectors, NULL, NULL, NULL); switch (channel->band) { case NL80211_BAND_S1GHZ: if (WARN_ON(ap_mode != IEEE80211_CONN_MODE_S1G)) { ret = -EINVAL; goto free; } chanreq->oper = *ap_chandef; if (!cfg80211_chandef_usable(sdata->wdev.wiphy, &chanreq->oper, IEEE80211_CHAN_DISABLED)) { ret = -EINVAL; goto free; } return elems; case NL80211_BAND_6GHZ: if (ap_mode < IEEE80211_CONN_MODE_HE) { link_id_info(sdata, link_id, "Rejecting non-HE 6/7 GHz connection"); ret = -EINVAL; goto free; } break; default: if (WARN_ON(ap_mode == IEEE80211_CONN_MODE_S1G)) { ret = -EINVAL; goto free; } } switch (ap_mode) { case IEEE80211_CONN_MODE_S1G: WARN_ON(1); ret = -EINVAL; goto free; case IEEE80211_CONN_MODE_LEGACY: conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; break; case IEEE80211_CONN_MODE_HT: conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_40); break; case IEEE80211_CONN_MODE_VHT: case IEEE80211_CONN_MODE_HE: conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_160); break; case IEEE80211_CONN_MODE_EHT: conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_320); break; } chanreq->oper = *ap_chandef; bitmap_copy(sta_selectors, userspace_selectors, 128); if (conn->mode >= IEEE80211_CONN_MODE_HT) set_bit(BSS_MEMBERSHIP_SELECTOR_HT_PHY, sta_selectors); if (conn->mode >= IEEE80211_CONN_MODE_VHT) set_bit(BSS_MEMBERSHIP_SELECTOR_VHT_PHY, sta_selectors); if (conn->mode >= IEEE80211_CONN_MODE_HE) set_bit(BSS_MEMBERSHIP_SELECTOR_HE_PHY, sta_selectors); if (conn->mode >= IEEE80211_CONN_MODE_EHT) set_bit(BSS_MEMBERSHIP_SELECTOR_EHT_PHY, sta_selectors); /* * We do not support EPD or GLK so never add them. * SAE_H2E is handled through userspace_selectors. */ /* Check if we support all required features */ if (!bitmap_subset(unknown_rates_selectors, sta_selectors, 128)) { link_id_info(sdata, link_id, "required basic rate or BSS membership selectors not supported or disabled, rejecting connection\n"); ret = -EINVAL; goto free; } ieee80211_set_chanreq_ap(sdata, chanreq, conn, ap_chandef); while (!ieee80211_chandef_usable(sdata, &chanreq->oper, IEEE80211_CHAN_DISABLED)) { if (WARN_ON(chanreq->oper.width == NL80211_CHAN_WIDTH_20_NOHT)) { ret = -EINVAL; goto free; } ieee80211_chanreq_downgrade(chanreq, conn); } if (conn->mode >= IEEE80211_CONN_MODE_HE && !cfg80211_chandef_usable(sdata->wdev.wiphy, &chanreq->oper, IEEE80211_CHAN_NO_HE)) { conn->mode = IEEE80211_CONN_MODE_VHT; conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_160); } if (conn->mode >= IEEE80211_CONN_MODE_EHT && !cfg80211_chandef_usable(sdata->wdev.wiphy, &chanreq->oper, IEEE80211_CHAN_NO_EHT)) { conn->mode = IEEE80211_CONN_MODE_HE; conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_160); } if (chanreq->oper.width != ap_chandef->width || ap_mode != conn->mode) link_id_info(sdata, link_id, "regulatory prevented using AP config, downgraded\n"); if (conn->mode >= IEEE80211_CONN_MODE_HT && !ieee80211_verify_sta_ht_mcs_support(sdata, sband, elems->ht_operation)) { conn->mode = IEEE80211_CONN_MODE_LEGACY; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; link_id_info(sdata, link_id, "required MCSes not supported, disabling HT\n"); } if (conn->mode >= IEEE80211_CONN_MODE_VHT && !ieee80211_verify_sta_vht_mcs_support(sdata, link_id, sband, elems->vht_operation)) { conn->mode = IEEE80211_CONN_MODE_HT; conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_40); link_id_info(sdata, link_id, "required MCSes not supported, disabling VHT\n"); } if (conn->mode >= IEEE80211_CONN_MODE_HE && (!ieee80211_verify_peer_he_mcs_support(sdata, link_id, (void *)elems->he_cap, elems->he_operation) || !ieee80211_verify_sta_he_mcs_support(sdata, sband, elems->he_operation))) { conn->mode = IEEE80211_CONN_MODE_VHT; link_id_info(sdata, link_id, "required MCSes not supported, disabling HE\n"); } if (conn->mode >= IEEE80211_CONN_MODE_EHT && !ieee80211_verify_sta_eht_mcs_support(sdata, sband, elems->eht_operation)) { conn->mode = IEEE80211_CONN_MODE_HE; conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_160); link_id_info(sdata, link_id, "required MCSes not supported, disabling EHT\n"); } if (conn->mode >= IEEE80211_CONN_MODE_EHT && channel->band != NL80211_BAND_2GHZ && conn->bw_limit == IEEE80211_CONN_BW_LIMIT_40) { conn->mode = IEEE80211_CONN_MODE_HE; link_id_info(sdata, link_id, "required bandwidth not supported, disabling EHT\n"); } /* the mode can only decrease, so this must terminate */ if (ap_mode != conn->mode) { kfree(elems); goto again; } mlme_link_id_dbg(sdata, link_id, "connecting with %s mode, max bandwidth %d MHz\n", ieee80211_conn_mode_str(conn->mode), 20 * (1 << conn->bw_limit)); if (WARN_ON_ONCE(!cfg80211_chandef_valid(&chanreq->oper))) { ret = -EINVAL; goto free; } return elems; free: kfree(elems); return ERR_PTR(ret); } EXPORT_SYMBOL_IF_MAC80211_KUNIT(ieee80211_determine_chan_mode); static int ieee80211_config_bw(struct ieee80211_link_data *link, struct ieee802_11_elems *elems, bool update, u64 *changed, u16 stype) { struct ieee80211_channel *channel = link->conf->chanreq.oper.chan; struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_chan_req chanreq = {}; struct cfg80211_chan_def ap_chandef; enum ieee80211_conn_mode ap_mode; const char *frame; u32 vht_cap_info = 0; u16 ht_opmode; int ret; switch (stype) { case IEEE80211_STYPE_BEACON: frame = "beacon"; break; case IEEE80211_STYPE_ASSOC_RESP: frame = "assoc response"; break; case IEEE80211_STYPE_REASSOC_RESP: frame = "reassoc response"; break; case IEEE80211_STYPE_ACTION: /* the only action frame that gets here */ frame = "ML reconf response"; break; default: return -EINVAL; } /* don't track any bandwidth changes in legacy/S1G modes */ if (link->u.mgd.conn.mode == IEEE80211_CONN_MODE_LEGACY || link->u.mgd.conn.mode == IEEE80211_CONN_MODE_S1G) return 0; if (elems->vht_cap_elem) vht_cap_info = le32_to_cpu(elems->vht_cap_elem->vht_cap_info); ap_mode = ieee80211_determine_ap_chan(sdata, channel, vht_cap_info, elems, true, &link->u.mgd.conn, &ap_chandef); if (ap_mode != link->u.mgd.conn.mode) { link_info(link, "AP %pM appears to change mode (expected %s, found %s) in %s, disconnect\n", link->u.mgd.bssid, ieee80211_conn_mode_str(link->u.mgd.conn.mode), ieee80211_conn_mode_str(ap_mode), frame); return -EINVAL; } chanreq.oper = ap_chandef; ieee80211_set_chanreq_ap(sdata, &chanreq, &link->u.mgd.conn, &ap_chandef); /* * if HT operation mode changed store the new one - * this may be applicable even if channel is identical */ if (elems->ht_operation) { ht_opmode = le16_to_cpu(elems->ht_operation->operation_mode); if (link->conf->ht_operation_mode != ht_opmode) { *changed |= BSS_CHANGED_HT; link->conf->ht_operation_mode = ht_opmode; } } /* * Downgrade the new channel if we associated with restricted * bandwidth capabilities. For example, if we associated as a * 20 MHz STA to a 40 MHz AP (due to regulatory, capabilities * or config reasons) then switching to a 40 MHz channel now * won't do us any good -- we couldn't use it with the AP. */ while (link->u.mgd.conn.bw_limit < ieee80211_min_bw_limit_from_chandef(&chanreq.oper)) ieee80211_chandef_downgrade(&chanreq.oper, NULL); /* TPE element is not present in (re)assoc/ML reconfig response */ if (stype == IEEE80211_STYPE_BEACON && ap_chandef.chan->band == NL80211_BAND_6GHZ && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HE) { ieee80211_rearrange_tpe(&elems->tpe, &ap_chandef, &chanreq.oper); if (memcmp(&link->conf->tpe, &elems->tpe, sizeof(elems->tpe))) { link->conf->tpe = elems->tpe; *changed |= BSS_CHANGED_TPE; } } if (ieee80211_chanreq_identical(&chanreq, &link->conf->chanreq)) return 0; link_info(link, "AP %pM changed bandwidth in %s, new used config is %d.%03d MHz, width %d (%d.%03d/%d MHz)\n", link->u.mgd.bssid, frame, chanreq.oper.chan->center_freq, chanreq.oper.chan->freq_offset, chanreq.oper.width, chanreq.oper.center_freq1, chanreq.oper.freq1_offset, chanreq.oper.center_freq2); if (!cfg80211_chandef_valid(&chanreq.oper)) { sdata_info(sdata, "AP %pM changed caps/bw in %s in a way we can't support - disconnect\n", link->u.mgd.bssid, frame); return -EINVAL; } if (!update) { link->conf->chanreq = chanreq; return 0; } /* * We're tracking the current AP here, so don't do any further checks * here. This keeps us from playing ping-pong with regulatory, without * it the following can happen (for example): * - connect to an AP with 80 MHz, world regdom allows 80 MHz * - AP advertises regdom US * - CRDA loads regdom US with 80 MHz prohibited (old database) * - we detect an unsupported channel and disconnect * - disconnect causes CRDA to reload world regdomain and the game * starts anew. * (see https://bugzilla.kernel.org/show_bug.cgi?id=70881) * * It seems possible that there are still scenarios with CSA or real * bandwidth changes where a this could happen, but those cases are * less common and wouldn't completely prevent using the AP. */ ret = ieee80211_link_change_chanreq(link, &chanreq, changed); if (ret) { sdata_info(sdata, "AP %pM changed bandwidth in %s to incompatible one - disconnect\n", link->u.mgd.bssid, frame); return ret; } cfg80211_schedule_channels_check(&sdata->wdev); return 0; } /* frame sending functions */ static void ieee80211_add_ht_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u8 ap_ht_param, struct ieee80211_supported_band *sband, struct ieee80211_channel *channel, enum ieee80211_smps_mode smps, const struct ieee80211_conn_settings *conn) { u8 *pos; u32 flags = channel->flags; u16 cap; struct ieee80211_sta_ht_cap ht_cap; BUILD_BUG_ON(sizeof(ht_cap) != sizeof(sband->ht_cap)); memcpy(&ht_cap, &sband->ht_cap, sizeof(ht_cap)); ieee80211_apply_htcap_overrides(sdata, &ht_cap); /* determine capability flags */ cap = ht_cap.cap; switch (ap_ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) { case IEEE80211_HT_PARAM_CHA_SEC_ABOVE: if (flags & IEEE80211_CHAN_NO_HT40PLUS) { cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40; cap &= ~IEEE80211_HT_CAP_SGI_40; } break; case IEEE80211_HT_PARAM_CHA_SEC_BELOW: if (flags & IEEE80211_CHAN_NO_HT40MINUS) { cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40; cap &= ~IEEE80211_HT_CAP_SGI_40; } break; } /* * If 40 MHz was disabled associate as though we weren't * capable of 40 MHz -- some broken APs will never fall * back to trying to transmit in 20 MHz. */ if (conn->bw_limit <= IEEE80211_CONN_BW_LIMIT_20) { cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40; cap &= ~IEEE80211_HT_CAP_SGI_40; } /* set SM PS mode properly */ cap &= ~IEEE80211_HT_CAP_SM_PS; switch (smps) { case IEEE80211_SMPS_AUTOMATIC: case IEEE80211_SMPS_NUM_MODES: WARN_ON(1); fallthrough; case IEEE80211_SMPS_OFF: cap |= WLAN_HT_CAP_SM_PS_DISABLED << IEEE80211_HT_CAP_SM_PS_SHIFT; break; case IEEE80211_SMPS_STATIC: cap |= WLAN_HT_CAP_SM_PS_STATIC << IEEE80211_HT_CAP_SM_PS_SHIFT; break; case IEEE80211_SMPS_DYNAMIC: cap |= WLAN_HT_CAP_SM_PS_DYNAMIC << IEEE80211_HT_CAP_SM_PS_SHIFT; break; } /* reserve and fill IE */ pos = skb_put(skb, sizeof(struct ieee80211_ht_cap) + 2); ieee80211_ie_build_ht_cap(pos, &ht_cap, cap); } /* This function determines vht capability flags for the association * and builds the IE. * Note - the function returns true to own the MU-MIMO capability */ static bool ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct ieee80211_supported_band *sband, struct ieee80211_vht_cap *ap_vht_cap, const struct ieee80211_conn_settings *conn) { struct ieee80211_local *local = sdata->local; u8 *pos; u32 cap; struct ieee80211_sta_vht_cap vht_cap; u32 mask, ap_bf_sts, our_bf_sts; bool mu_mimo_owner = false; BUILD_BUG_ON(sizeof(vht_cap) != sizeof(sband->vht_cap)); memcpy(&vht_cap, &sband->vht_cap, sizeof(vht_cap)); ieee80211_apply_vhtcap_overrides(sdata, &vht_cap); /* determine capability flags */ cap = vht_cap.cap; if (conn->bw_limit <= IEEE80211_CONN_BW_LIMIT_80) { cap &= ~IEEE80211_VHT_CAP_SHORT_GI_160; cap &= ~IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; } /* * Some APs apparently get confused if our capabilities are better * than theirs, so restrict what we advertise in the assoc request. */ if (!ieee80211_hw_check(&local->hw, STRICT)) { if (!(ap_vht_cap->vht_cap_info & cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE))) cap &= ~(IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE | IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE); else if (!(ap_vht_cap->vht_cap_info & cpu_to_le32(IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE))) cap &= ~IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE; } /* * If some other vif is using the MU-MIMO capability we cannot associate * using MU-MIMO - this will lead to contradictions in the group-id * mechanism. * Ownership is defined since association request, in order to avoid * simultaneous associations with MU-MIMO. */ if (cap & IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE) { bool disable_mu_mimo = false; struct ieee80211_sub_if_data *other; list_for_each_entry(other, &local->interfaces, list) { if (other->vif.bss_conf.mu_mimo_owner) { disable_mu_mimo = true; break; } } if (disable_mu_mimo) cap &= ~IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE; else mu_mimo_owner = true; } mask = IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK; ap_bf_sts = le32_to_cpu(ap_vht_cap->vht_cap_info) & mask; our_bf_sts = cap & mask; if (ap_bf_sts < our_bf_sts) { cap &= ~mask; cap |= ap_bf_sts; } /* reserve and fill IE */ pos = skb_put(skb, sizeof(struct ieee80211_vht_cap) + 2); ieee80211_ie_build_vht_cap(pos, &vht_cap, cap); return mu_mimo_owner; } static void ieee80211_assoc_add_rates(struct ieee80211_local *local, struct sk_buff *skb, enum nl80211_chan_width width, struct ieee80211_supported_band *sband, struct ieee80211_mgd_assoc_data *assoc_data) { u32 rates; if (assoc_data->supp_rates_len && !ieee80211_hw_check(&local->hw, STRICT)) { /* * Get all rates supported by the device and the AP as * some APs don't like getting a superset of their rates * in the association request (e.g. D-Link DAP 1353 in * b-only mode)... */ ieee80211_parse_bitrates(width, sband, assoc_data->supp_rates, assoc_data->supp_rates_len, &rates); } else { /* * In case AP not provide any supported rates information * before association, we send information element(s) with * all rates that we support. */ rates = ~0; } ieee80211_put_srates_elem(skb, sband, 0, ~rates, WLAN_EID_SUPP_RATES); ieee80211_put_srates_elem(skb, sband, 0, ~rates, WLAN_EID_EXT_SUPP_RATES); } static size_t ieee80211_add_before_ht_elems(struct sk_buff *skb, const u8 *elems, size_t elems_len, size_t offset) { size_t noffset; static const u8 before_ht[] = { WLAN_EID_SSID, WLAN_EID_SUPP_RATES, WLAN_EID_EXT_SUPP_RATES, WLAN_EID_PWR_CAPABILITY, WLAN_EID_SUPPORTED_CHANNELS, WLAN_EID_RSN, WLAN_EID_QOS_CAPA, WLAN_EID_RRM_ENABLED_CAPABILITIES, WLAN_EID_MOBILITY_DOMAIN, WLAN_EID_FAST_BSS_TRANSITION, /* reassoc only */ WLAN_EID_RIC_DATA, /* reassoc only */ WLAN_EID_SUPPORTED_REGULATORY_CLASSES, }; static const u8 after_ric[] = { WLAN_EID_SUPPORTED_REGULATORY_CLASSES, WLAN_EID_HT_CAPABILITY, WLAN_EID_BSS_COEX_2040, /* luckily this is almost always there */ WLAN_EID_EXT_CAPABILITY, WLAN_EID_QOS_TRAFFIC_CAPA, WLAN_EID_TIM_BCAST_REQ, WLAN_EID_INTERWORKING, /* 60 GHz (Multi-band, DMG, MMS) can't happen */ WLAN_EID_VHT_CAPABILITY, WLAN_EID_OPMODE_NOTIF, }; if (!elems_len) return offset; noffset = ieee80211_ie_split_ric(elems, elems_len, before_ht, ARRAY_SIZE(before_ht), after_ric, ARRAY_SIZE(after_ric), offset); skb_put_data(skb, elems + offset, noffset - offset); return noffset; } static size_t ieee80211_add_before_vht_elems(struct sk_buff *skb, const u8 *elems, size_t elems_len, size_t offset) { static const u8 before_vht[] = { /* * no need to list the ones split off before HT * or generated here */ WLAN_EID_BSS_COEX_2040, WLAN_EID_EXT_CAPABILITY, WLAN_EID_QOS_TRAFFIC_CAPA, WLAN_EID_TIM_BCAST_REQ, WLAN_EID_INTERWORKING, /* 60 GHz (Multi-band, DMG, MMS) can't happen */ }; size_t noffset; if (!elems_len) return offset; /* RIC already taken care of in ieee80211_add_before_ht_elems() */ noffset = ieee80211_ie_split(elems, elems_len, before_vht, ARRAY_SIZE(before_vht), offset); skb_put_data(skb, elems + offset, noffset - offset); return noffset; } static size_t ieee80211_add_before_he_elems(struct sk_buff *skb, const u8 *elems, size_t elems_len, size_t offset) { static const u8 before_he[] = { /* * no need to list the ones split off before VHT * or generated here */ WLAN_EID_OPMODE_NOTIF, WLAN_EID_EXTENSION, WLAN_EID_EXT_FUTURE_CHAN_GUIDANCE, /* 11ai elements */ WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_SESSION, WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_PUBLIC_KEY, WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_KEY_CONFIRM, WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_HLP_CONTAINER, WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_IP_ADDR_ASSIGN, /* TODO: add 11ah/11aj/11ak elements */ }; size_t noffset; if (!elems_len) return offset; /* RIC already taken care of in ieee80211_add_before_ht_elems() */ noffset = ieee80211_ie_split(elems, elems_len, before_he, ARRAY_SIZE(before_he), offset); skb_put_data(skb, elems + offset, noffset - offset); return noffset; } static size_t ieee80211_add_before_reg_conn(struct sk_buff *skb, const u8 *elems, size_t elems_len, size_t offset) { static const u8 before_reg_conn[] = { /* * no need to list the ones split off before HE * or generated here */ WLAN_EID_EXTENSION, WLAN_EID_EXT_DH_PARAMETER, WLAN_EID_EXTENSION, WLAN_EID_EXT_KNOWN_STA_IDENTIFCATION, }; size_t noffset; if (!elems_len) return offset; noffset = ieee80211_ie_split(elems, elems_len, before_reg_conn, ARRAY_SIZE(before_reg_conn), offset); skb_put_data(skb, elems + offset, noffset - offset); return noffset; } #define PRESENT_ELEMS_MAX 8 #define PRESENT_ELEM_EXT_OFFS 0x100 static void ieee80211_assoc_add_ml_elem(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u16 capab, const struct element *ext_capa, const u16 *present_elems, struct ieee80211_mgd_assoc_data *assoc_data); static size_t ieee80211_add_link_elems(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u16 *capab, const struct element *ext_capa, const u8 *extra_elems, size_t extra_elems_len, unsigned int link_id, struct ieee80211_link_data *link, u16 *present_elems, struct ieee80211_mgd_assoc_data *assoc_data) { enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; struct ieee80211_channel *chan = cbss->channel; const struct ieee80211_sband_iftype_data *iftd; struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; enum nl80211_chan_width width = NL80211_CHAN_WIDTH_20; struct ieee80211_chanctx_conf *chanctx_conf; enum ieee80211_smps_mode smps_mode; u16 orig_capab = *capab; size_t offset = 0; int present_elems_len = 0; u8 *pos; int i; #define ADD_PRESENT_ELEM(id) do { \ /* need a last for termination - we use 0 == SSID */ \ if (!WARN_ON(present_elems_len >= PRESENT_ELEMS_MAX - 1)) \ present_elems[present_elems_len++] = (id); \ } while (0) #define ADD_PRESENT_EXT_ELEM(id) ADD_PRESENT_ELEM(PRESENT_ELEM_EXT_OFFS | (id)) if (link) smps_mode = link->smps_mode; else if (sdata->u.mgd.powersave) smps_mode = IEEE80211_SMPS_DYNAMIC; else smps_mode = IEEE80211_SMPS_OFF; if (link) { /* * 5/10 MHz scenarios are only viable without MLO, in which * case this pointer should be used ... All of this is a bit * unclear though, not sure this even works at all. */ rcu_read_lock(); chanctx_conf = rcu_dereference(link->conf->chanctx_conf); if (chanctx_conf) width = chanctx_conf->def.width; rcu_read_unlock(); } sband = local->hw.wiphy->bands[chan->band]; iftd = ieee80211_get_sband_iftype_data(sband, iftype); if (sband->band == NL80211_BAND_2GHZ) { *capab |= WLAN_CAPABILITY_SHORT_SLOT_TIME; *capab |= WLAN_CAPABILITY_SHORT_PREAMBLE; } if ((cbss->capability & WLAN_CAPABILITY_SPECTRUM_MGMT) && ieee80211_hw_check(&local->hw, SPECTRUM_MGMT)) *capab |= WLAN_CAPABILITY_SPECTRUM_MGMT; if (sband->band != NL80211_BAND_S1GHZ) ieee80211_assoc_add_rates(local, skb, width, sband, assoc_data); if (*capab & WLAN_CAPABILITY_SPECTRUM_MGMT || *capab & WLAN_CAPABILITY_RADIO_MEASURE) { struct cfg80211_chan_def chandef = { .width = width, .chan = chan, }; pos = skb_put(skb, 4); *pos++ = WLAN_EID_PWR_CAPABILITY; *pos++ = 2; *pos++ = 0; /* min tx power */ /* max tx power */ *pos++ = ieee80211_chandef_max_power(&chandef); ADD_PRESENT_ELEM(WLAN_EID_PWR_CAPABILITY); } /* * Per spec, we shouldn't include the list of channels if we advertise * support for extended channel switching, but we've always done that; * (for now?) apply this restriction only on the (new) 6 GHz band. */ if (*capab & WLAN_CAPABILITY_SPECTRUM_MGMT && (sband->band != NL80211_BAND_6GHZ || !ext_capa || ext_capa->datalen < 1 || !(ext_capa->data[0] & WLAN_EXT_CAPA1_EXT_CHANNEL_SWITCHING))) { /* TODO: get this in reg domain format */ pos = skb_put(skb, 2 * sband->n_channels + 2); *pos++ = WLAN_EID_SUPPORTED_CHANNELS; *pos++ = 2 * sband->n_channels; for (i = 0; i < sband->n_channels; i++) { int cf = sband->channels[i].center_freq; *pos++ = ieee80211_frequency_to_channel(cf); *pos++ = 1; /* one channel in the subband*/ } ADD_PRESENT_ELEM(WLAN_EID_SUPPORTED_CHANNELS); } /* if present, add any custom IEs that go before HT */ offset = ieee80211_add_before_ht_elems(skb, extra_elems, extra_elems_len, offset); if (sband->band != NL80211_BAND_6GHZ && assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_HT) { ieee80211_add_ht_ie(sdata, skb, assoc_data->link[link_id].ap_ht_param, sband, chan, smps_mode, &assoc_data->link[link_id].conn); ADD_PRESENT_ELEM(WLAN_EID_HT_CAPABILITY); } /* if present, add any custom IEs that go before VHT */ offset = ieee80211_add_before_vht_elems(skb, extra_elems, extra_elems_len, offset); if (sband->band != NL80211_BAND_6GHZ && assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_VHT && sband->vht_cap.vht_supported) { bool mu_mimo_owner = ieee80211_add_vht_ie(sdata, skb, sband, &assoc_data->link[link_id].ap_vht_cap, &assoc_data->link[link_id].conn); if (link) link->conf->mu_mimo_owner = mu_mimo_owner; ADD_PRESENT_ELEM(WLAN_EID_VHT_CAPABILITY); } /* if present, add any custom IEs that go before HE */ offset = ieee80211_add_before_he_elems(skb, extra_elems, extra_elems_len, offset); if (assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_HE) { ieee80211_put_he_cap(skb, sdata, sband, &assoc_data->link[link_id].conn); ADD_PRESENT_EXT_ELEM(WLAN_EID_EXT_HE_CAPABILITY); if (sband->band == NL80211_BAND_6GHZ) ieee80211_put_he_6ghz_cap(skb, sdata, smps_mode); } /* * if present, add any custom IEs that go before regulatory * connectivity element */ offset = ieee80211_add_before_reg_conn(skb, extra_elems, extra_elems_len, offset); if (sband->band == NL80211_BAND_6GHZ) { /* * as per Section E.2.7 of IEEE 802.11 REVme D7.0, non-AP STA * capable of operating on the 6 GHz band shall transmit * regulatory connectivity element. */ ieee80211_put_reg_conn(skb, chan->flags); } /* * careful - need to know about all the present elems before * calling ieee80211_assoc_add_ml_elem(), so add this one if * we're going to put it after the ML element */ if (assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_EHT) ADD_PRESENT_EXT_ELEM(WLAN_EID_EXT_EHT_CAPABILITY); if (link_id == assoc_data->assoc_link_id) ieee80211_assoc_add_ml_elem(sdata, skb, orig_capab, ext_capa, present_elems, assoc_data); /* crash if somebody gets it wrong */ present_elems = NULL; if (assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_EHT) ieee80211_put_eht_cap(skb, sdata, sband, &assoc_data->link[link_id].conn); if (sband->band == NL80211_BAND_S1GHZ) { ieee80211_add_aid_request_ie(sdata, skb); ieee80211_add_s1g_capab_ie(sdata, &sband->s1g_cap, skb); } if (iftd && iftd->vendor_elems.data && iftd->vendor_elems.len) skb_put_data(skb, iftd->vendor_elems.data, iftd->vendor_elems.len); return offset; } static void ieee80211_add_non_inheritance_elem(struct sk_buff *skb, const u16 *outer, const u16 *inner) { unsigned int skb_len = skb->len; bool at_extension = false; bool added = false; int i, j; u8 *len, *list_len = NULL; skb_put_u8(skb, WLAN_EID_EXTENSION); len = skb_put(skb, 1); skb_put_u8(skb, WLAN_EID_EXT_NON_INHERITANCE); for (i = 0; i < PRESENT_ELEMS_MAX && outer[i]; i++) { u16 elem = outer[i]; bool have_inner = false; /* should at least be sorted in the sense of normal -> ext */ WARN_ON(at_extension && elem < PRESENT_ELEM_EXT_OFFS); /* switch to extension list */ if (!at_extension && elem >= PRESENT_ELEM_EXT_OFFS) { at_extension = true; if (!list_len) skb_put_u8(skb, 0); list_len = NULL; } for (j = 0; j < PRESENT_ELEMS_MAX && inner[j]; j++) { if (elem == inner[j]) { have_inner = true; break; } } if (have_inner) continue; if (!list_len) { list_len = skb_put(skb, 1); *list_len = 0; } *list_len += 1; skb_put_u8(skb, (u8)elem); added = true; } /* if we added a list but no extension list, make a zero-len one */ if (added && (!at_extension || !list_len)) skb_put_u8(skb, 0); /* if nothing added remove extension element completely */ if (!added) skb_trim(skb, skb_len); else *len = skb->len - skb_len - 2; } static void ieee80211_assoc_add_ml_elem(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u16 capab, const struct element *ext_capa, const u16 *outer_present_elems, struct ieee80211_mgd_assoc_data *assoc_data) { struct ieee80211_local *local = sdata->local; struct ieee80211_multi_link_elem *ml_elem; struct ieee80211_mle_basic_common_info *common; const struct wiphy_iftype_ext_capab *ift_ext_capa; __le16 eml_capa = 0, mld_capa_ops = 0; unsigned int link_id; u8 *ml_elem_len; void *capab_pos; if (!ieee80211_vif_is_mld(&sdata->vif)) return; ift_ext_capa = cfg80211_get_iftype_ext_capa(local->hw.wiphy, ieee80211_vif_type_p2p(&sdata->vif)); if (ift_ext_capa) { eml_capa = cpu_to_le16(ift_ext_capa->eml_capabilities); mld_capa_ops = cpu_to_le16(ift_ext_capa->mld_capa_and_ops); } skb_put_u8(skb, WLAN_EID_EXTENSION); ml_elem_len = skb_put(skb, 1); skb_put_u8(skb, WLAN_EID_EXT_EHT_MULTI_LINK); ml_elem = skb_put(skb, sizeof(*ml_elem)); ml_elem->control = cpu_to_le16(IEEE80211_ML_CONTROL_TYPE_BASIC | IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP); common = skb_put(skb, sizeof(*common)); common->len = sizeof(*common) + 2; /* MLD capa/ops */ memcpy(common->mld_mac_addr, sdata->vif.addr, ETH_ALEN); /* add EML_CAPA only if needed, see Draft P802.11be_D2.1, 35.3.17 */ if (eml_capa & cpu_to_le16((IEEE80211_EML_CAP_EMLSR_SUPP | IEEE80211_EML_CAP_EMLMR_SUPPORT))) { common->len += 2; /* EML capabilities */ ml_elem->control |= cpu_to_le16(IEEE80211_MLC_BASIC_PRES_EML_CAPA); skb_put_data(skb, &eml_capa, sizeof(eml_capa)); } skb_put_data(skb, &mld_capa_ops, sizeof(mld_capa_ops)); if (assoc_data->ext_mld_capa_ops) { ml_elem->control |= cpu_to_le16(IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP); common->len += 2; skb_put_data(skb, &assoc_data->ext_mld_capa_ops, sizeof(assoc_data->ext_mld_capa_ops)); } for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { u16 link_present_elems[PRESENT_ELEMS_MAX] = {}; const u8 *extra_elems; size_t extra_elems_len; size_t extra_used; u8 *subelem_len = NULL; __le16 ctrl; if (!assoc_data->link[link_id].bss || link_id == assoc_data->assoc_link_id) continue; extra_elems = assoc_data->link[link_id].elems; extra_elems_len = assoc_data->link[link_id].elems_len; skb_put_u8(skb, IEEE80211_MLE_SUBELEM_PER_STA_PROFILE); subelem_len = skb_put(skb, 1); ctrl = cpu_to_le16(link_id | IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE | IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT); skb_put_data(skb, &ctrl, sizeof(ctrl)); skb_put_u8(skb, 1 + ETH_ALEN); /* STA Info Length */ skb_put_data(skb, assoc_data->link[link_id].addr, ETH_ALEN); /* * Now add the contents of the (re)association request, * but the "listen interval" and "current AP address" * (if applicable) are skipped. So we only have * the capability field (remember the position and fill * later), followed by the elements added below by * calling ieee80211_add_link_elems(). */ capab_pos = skb_put(skb, 2); extra_used = ieee80211_add_link_elems(sdata, skb, &capab, ext_capa, extra_elems, extra_elems_len, link_id, NULL, link_present_elems, assoc_data); if (extra_elems) skb_put_data(skb, extra_elems + extra_used, extra_elems_len - extra_used); put_unaligned_le16(capab, capab_pos); ieee80211_add_non_inheritance_elem(skb, outer_present_elems, link_present_elems); ieee80211_fragment_element(skb, subelem_len, IEEE80211_MLE_SUBELEM_FRAGMENT); } ieee80211_fragment_element(skb, ml_elem_len, WLAN_EID_FRAGMENT); } static int ieee80211_link_common_elems_size(struct ieee80211_sub_if_data *sdata, enum nl80211_iftype iftype, struct cfg80211_bss *cbss, size_t elems_len) { struct ieee80211_local *local = sdata->local; const struct ieee80211_sband_iftype_data *iftd; struct ieee80211_supported_band *sband; size_t size = 0; if (!cbss) return size; sband = local->hw.wiphy->bands[cbss->channel->band]; /* add STA profile elements length */ size += elems_len; /* and supported rates length */ size += 4 + sband->n_bitrates; /* supported channels */ size += 2 + 2 * sband->n_channels; iftd = ieee80211_get_sband_iftype_data(sband, iftype); if (iftd) size += iftd->vendor_elems.len; /* power capability */ size += 4; /* HT, VHT, HE, EHT */ size += 2 + sizeof(struct ieee80211_ht_cap); size += 2 + sizeof(struct ieee80211_vht_cap); size += 2 + 1 + sizeof(struct ieee80211_he_cap_elem) + sizeof(struct ieee80211_he_mcs_nss_supp) + IEEE80211_HE_PPE_THRES_MAX_LEN; if (sband->band == NL80211_BAND_6GHZ) { size += 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa); /* reg connection */ size += 4; } size += 2 + 1 + sizeof(struct ieee80211_eht_cap_elem) + sizeof(struct ieee80211_eht_mcs_nss_supp) + IEEE80211_EHT_PPE_THRES_MAX_LEN; return size; } static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data; struct ieee80211_link_data *link; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; u8 *pos, qos_info, *ie_start; size_t offset, noffset; u16 capab = 0, link_capab; __le16 listen_int; struct element *ext_capa = NULL; enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); struct ieee80211_prep_tx_info info = {}; unsigned int link_id, n_links = 0; u16 present_elems[PRESENT_ELEMS_MAX] = {}; void *capab_pos; size_t size; int ret; /* we know it's writable, cast away the const */ if (assoc_data->ie_len) ext_capa = (void *)cfg80211_find_elem(WLAN_EID_EXT_CAPABILITY, assoc_data->ie, assoc_data->ie_len); lockdep_assert_wiphy(sdata->local->hw.wiphy); size = local->hw.extra_tx_headroom + sizeof(*mgmt) + /* bit too much but doesn't matter */ 2 + assoc_data->ssid_len + /* SSID */ assoc_data->ie_len + /* extra IEs */ (assoc_data->fils_kek_len ? 16 /* AES-SIV */ : 0) + 9; /* WMM */ for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; size_t elems_len = assoc_data->link[link_id].elems_len; if (!cbss) continue; n_links++; size += ieee80211_link_common_elems_size(sdata, iftype, cbss, elems_len); /* non-inheritance element */ size += 2 + 2 + PRESENT_ELEMS_MAX; /* should be the same across all BSSes */ if (cbss->capability & WLAN_CAPABILITY_PRIVACY) capab |= WLAN_CAPABILITY_PRIVACY; } if (ieee80211_vif_is_mld(&sdata->vif)) { /* consider the multi-link element with STA profile */ size += sizeof(struct ieee80211_multi_link_elem); /* max common info field in basic multi-link element */ size += sizeof(struct ieee80211_mle_basic_common_info) + 2 + /* capa & op */ 2 + /* ext capa & op */ 2; /* EML capa */ /* The capability elements were already considered above */ size += (n_links - 1) * (1 + 1 + /* subelement ID/length */ 2 + /* STA control */ 1 + ETH_ALEN + 2 /* STA Info field */); } link = sdata_dereference(sdata->link[assoc_data->assoc_link_id], sdata); if (WARN_ON(!link)) return -EINVAL; if (WARN_ON(!assoc_data->link[assoc_data->assoc_link_id].bss)) return -EINVAL; skb = alloc_skb(size, GFP_KERNEL); if (!skb) return -ENOMEM; skb_reserve(skb, local->hw.extra_tx_headroom); if (ifmgd->flags & IEEE80211_STA_ENABLE_RRM) capab |= WLAN_CAPABILITY_RADIO_MEASURE; /* Set MBSSID support for HE AP if needed */ if (ieee80211_hw_check(&local->hw, SUPPORTS_ONLY_HE_MULTI_BSSID) && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HE && ext_capa && ext_capa->datalen >= 3) ext_capa->data[2] |= WLAN_EXT_CAPA3_MULTI_BSSID_SUPPORT; mgmt = skb_put_zero(skb, 24); memcpy(mgmt->da, sdata->vif.cfg.ap_addr, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); listen_int = cpu_to_le16(assoc_data->s1g ? ieee80211_encode_usf(local->hw.conf.listen_interval) : local->hw.conf.listen_interval); if (!is_zero_ether_addr(assoc_data->prev_ap_addr)) { skb_put(skb, 10); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_REASSOC_REQ); capab_pos = &mgmt->u.reassoc_req.capab_info; mgmt->u.reassoc_req.listen_interval = listen_int; memcpy(mgmt->u.reassoc_req.current_ap, assoc_data->prev_ap_addr, ETH_ALEN); info.subtype = IEEE80211_STYPE_REASSOC_REQ; } else { skb_put(skb, 4); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ASSOC_REQ); capab_pos = &mgmt->u.assoc_req.capab_info; mgmt->u.assoc_req.listen_interval = listen_int; info.subtype = IEEE80211_STYPE_ASSOC_REQ; } /* SSID */ pos = skb_put(skb, 2 + assoc_data->ssid_len); ie_start = pos; *pos++ = WLAN_EID_SSID; *pos++ = assoc_data->ssid_len; memcpy(pos, assoc_data->ssid, assoc_data->ssid_len); /* * This bit is technically reserved, so it shouldn't matter for either * the AP or us, but it also means we shouldn't set it. However, we've * always set it in the past, and apparently some EHT APs check that * we don't set it. To avoid interoperability issues with old APs that * for some reason check it and want it to be set, set the bit for all * pre-EHT connections as we used to do. */ if (link->u.mgd.conn.mode < IEEE80211_CONN_MODE_EHT && !ieee80211_hw_check(&local->hw, STRICT)) capab |= WLAN_CAPABILITY_ESS; /* add the elements for the assoc (main) link */ link_capab = capab; offset = ieee80211_add_link_elems(sdata, skb, &link_capab, ext_capa, assoc_data->ie, assoc_data->ie_len, assoc_data->assoc_link_id, link, present_elems, assoc_data); put_unaligned_le16(link_capab, capab_pos); /* if present, add any custom non-vendor IEs */ if (assoc_data->ie_len) { noffset = ieee80211_ie_split_vendor(assoc_data->ie, assoc_data->ie_len, offset); skb_put_data(skb, assoc_data->ie + offset, noffset - offset); offset = noffset; } if (assoc_data->wmm) { if (assoc_data->uapsd) { qos_info = ifmgd->uapsd_queues; qos_info |= (ifmgd->uapsd_max_sp_len << IEEE80211_WMM_IE_STA_QOSINFO_SP_SHIFT); } else { qos_info = 0; } pos = ieee80211_add_wmm_info_ie(skb_put(skb, 9), qos_info); } /* add any remaining custom (i.e. vendor specific here) IEs */ if (assoc_data->ie_len) { noffset = assoc_data->ie_len; skb_put_data(skb, assoc_data->ie + offset, noffset - offset); } if (assoc_data->fils_kek_len) { ret = fils_encrypt_assoc_req(skb, assoc_data); if (ret < 0) { dev_kfree_skb(skb); return ret; } } pos = skb_tail_pointer(skb); kfree(ifmgd->assoc_req_ies); ifmgd->assoc_req_ies = kmemdup(ie_start, pos - ie_start, GFP_ATOMIC); if (!ifmgd->assoc_req_ies) { dev_kfree_skb(skb); return -ENOMEM; } ifmgd->assoc_req_ies_len = pos - ie_start; info.link_id = assoc_data->assoc_link_id; drv_mgd_prepare_tx(local, sdata, &info); IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_MLME_CONN_TX; ieee80211_tx_skb(sdata, skb); return 0; } void ieee80211_send_pspoll(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { struct ieee80211_pspoll *pspoll; struct sk_buff *skb; skb = ieee80211_pspoll_get(&local->hw, &sdata->vif); if (!skb) return; pspoll = (struct ieee80211_pspoll *) skb->data; pspoll->frame_control |= cpu_to_le16(IEEE80211_FCTL_PM); IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; ieee80211_tx_skb(sdata, skb); } void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, bool powersave) { struct sk_buff *skb; struct ieee80211_hdr_3addr *nullfunc; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif, -1, !ieee80211_hw_check(&local->hw, DOESNT_SUPPORT_QOS_NDP)); if (!skb) return; nullfunc = (struct ieee80211_hdr_3addr *) skb->data; if (powersave) nullfunc->frame_control |= cpu_to_le16(IEEE80211_FCTL_PM); IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | IEEE80211_TX_INTFL_OFFCHAN_TX_OK; if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; if (ifmgd->flags & IEEE80211_STA_CONNECTION_POLL) IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_USE_MINRATE; ieee80211_tx_skb(sdata, skb); } void ieee80211_send_4addr_nullfunc(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { struct sk_buff *skb; struct ieee80211_hdr *nullfunc; __le16 fc; if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) return; skb = dev_alloc_skb(local->hw.extra_tx_headroom + 30); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); nullfunc = skb_put_zero(skb, 30); fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC | IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS); nullfunc->frame_control = fc; memcpy(nullfunc->addr1, sdata->deflink.u.mgd.bssid, ETH_ALEN); memcpy(nullfunc->addr2, sdata->vif.addr, ETH_ALEN); memcpy(nullfunc->addr3, sdata->deflink.u.mgd.bssid, ETH_ALEN); memcpy(nullfunc->addr4, sdata->vif.addr, ETH_ALEN); IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_USE_MINRATE; ieee80211_tx_skb(sdata, skb); } /* spectrum management related things */ static void ieee80211_csa_switch_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_link_data *link = container_of(work, struct ieee80211_link_data, u.mgd.csa.switch_work.work); struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; int ret; if (!ieee80211_sdata_running(sdata)) return; lockdep_assert_wiphy(local->hw.wiphy); if (!ifmgd->associated) return; if (!link->conf->csa_active) return; /* * If the link isn't active (now), we cannot wait for beacons, won't * have a reserved chanctx, etc. Just switch over the chandef and * update cfg80211 directly. */ if (!ieee80211_vif_link_active(&sdata->vif, link->link_id)) { struct link_sta_info *link_sta; struct sta_info *ap_sta; link->conf->chanreq = link->csa.chanreq; cfg80211_ch_switch_notify(sdata->dev, &link->csa.chanreq.oper, link->link_id); link->conf->csa_active = false; ap_sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr); if (WARN_ON(!ap_sta)) return; link_sta = wiphy_dereference(wiphy, ap_sta->link[link->link_id]); if (WARN_ON(!link_sta)) return; link_sta->pub->bandwidth = _ieee80211_sta_cur_vht_bw(link_sta, &link->csa.chanreq.oper); return; } /* * using reservation isn't immediate as it may be deferred until later * with multi-vif. once reservation is complete it will re-schedule the * work with no reserved_chanctx so verify chandef to check if it * completed successfully */ if (link->reserved_chanctx) { /* * with multi-vif csa driver may call ieee80211_csa_finish() * many times while waiting for other interfaces to use their * reservations */ if (link->reserved_ready) return; ret = ieee80211_link_use_reserved_context(link); if (ret) { link_info(link, "failed to use reserved channel context, disconnecting (err=%d)\n", ret); wiphy_work_queue(sdata->local->hw.wiphy, &ifmgd->csa_connection_drop_work); } return; } if (!ieee80211_chanreq_identical(&link->conf->chanreq, &link->csa.chanreq)) { link_info(link, "failed to finalize channel switch, disconnecting\n"); wiphy_work_queue(sdata->local->hw.wiphy, &ifmgd->csa_connection_drop_work); return; } link->u.mgd.csa.waiting_bcn = true; /* apply new TPE restrictions immediately on the new channel */ if (link->u.mgd.csa.ap_chandef.chan->band == NL80211_BAND_6GHZ && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HE) { ieee80211_rearrange_tpe(&link->u.mgd.csa.tpe, &link->u.mgd.csa.ap_chandef, &link->conf->chanreq.oper); if (memcmp(&link->conf->tpe, &link->u.mgd.csa.tpe, sizeof(link->u.mgd.csa.tpe))) { link->conf->tpe = link->u.mgd.csa.tpe; ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_TPE); } } /* * It is not necessary to reset these timers if any link does not * have an active CSA and that link still receives the beacons * when other links have active CSA. */ for_each_link_data(sdata, link) { if (!link->conf->csa_active) return; } /* * Reset the beacon monitor and connection monitor timers when CSA * is active for all links in MLO when channel switch occurs in all * the links. */ ieee80211_sta_reset_beacon_monitor(sdata); ieee80211_sta_reset_conn_monitor(sdata); } static void ieee80211_chswitch_post_beacon(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; int ret; lockdep_assert_wiphy(sdata->local->hw.wiphy); WARN_ON(!link->conf->csa_active); ieee80211_vif_unblock_queues_csa(sdata); link->conf->csa_active = false; link->u.mgd.csa.blocked_tx = false; link->u.mgd.csa.waiting_bcn = false; ret = drv_post_channel_switch(link); if (ret) { link_info(link, "driver post channel switch failed, disconnecting\n"); wiphy_work_queue(sdata->local->hw.wiphy, &ifmgd->csa_connection_drop_work); return; } cfg80211_ch_switch_notify(sdata->dev, &link->conf->chanreq.oper, link->link_id); } void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); trace_api_chswitch_done(sdata, success, link_id); rcu_read_lock(); if (!success) { sdata_info(sdata, "driver channel switch failed (link %d), disconnecting\n", link_id); wiphy_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.csa_connection_drop_work); } else { struct ieee80211_link_data *link = rcu_dereference(sdata->link[link_id]); if (WARN_ON(!link)) { rcu_read_unlock(); return; } wiphy_delayed_work_queue(sdata->local->hw.wiphy, &link->u.mgd.csa.switch_work, 0); } rcu_read_unlock(); } EXPORT_SYMBOL(ieee80211_chswitch_done); static void ieee80211_sta_abort_chanswitch(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); if (!local->ops->abort_channel_switch) return; if (rcu_access_pointer(link->conf->chanctx_conf)) ieee80211_link_unreserve_chanctx(link); ieee80211_vif_unblock_queues_csa(sdata); link->conf->csa_active = false; link->u.mgd.csa.blocked_tx = false; drv_abort_channel_switch(link); } struct sta_csa_rnr_iter_data { struct ieee80211_link_data *link; struct ieee80211_channel *chan; u8 mld_id; }; static enum cfg80211_rnr_iter_ret ieee80211_sta_csa_rnr_iter(void *_data, u8 type, const struct ieee80211_neighbor_ap_info *info, const u8 *tbtt_info, u8 tbtt_info_len) { struct sta_csa_rnr_iter_data *data = _data; struct ieee80211_link_data *link = data->link; struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; const struct ieee80211_tbtt_info_ge_11 *ti; enum nl80211_band band; unsigned int center_freq; int link_id; if (type != IEEE80211_TBTT_INFO_TYPE_TBTT) return RNR_ITER_CONTINUE; if (tbtt_info_len < sizeof(*ti)) return RNR_ITER_CONTINUE; ti = (const void *)tbtt_info; if (ti->mld_params.mld_id != data->mld_id) return RNR_ITER_CONTINUE; link_id = le16_get_bits(ti->mld_params.params, IEEE80211_RNR_MLD_PARAMS_LINK_ID); if (link_id != data->link->link_id) return RNR_ITER_CONTINUE; /* we found the entry for our link! */ /* this AP is confused, it had this right before ... just disconnect */ if (!ieee80211_operating_class_to_band(info->op_class, &band)) { link_info(link, "AP now has invalid operating class in RNR, disconnect\n"); wiphy_work_queue(sdata->local->hw.wiphy, &ifmgd->csa_connection_drop_work); return RNR_ITER_BREAK; } center_freq = ieee80211_channel_to_frequency(info->channel, band); data->chan = ieee80211_get_channel(sdata->local->hw.wiphy, center_freq); return RNR_ITER_BREAK; } static void ieee80211_sta_other_link_csa_disappeared(struct ieee80211_link_data *link, struct ieee802_11_elems *elems) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct sta_csa_rnr_iter_data data = { .link = link, }; /* * If we get here, we see a beacon from another link without * CSA still being reported for it, so now we have to check * if the CSA was aborted or completed. This may not even be * perfectly possible if the CSA was only done for changing * the puncturing, but in that case if the link in inactive * we don't really care, and if it's an active link (or when * it's activated later) we'll get a beacon and adjust. */ if (WARN_ON(!elems->ml_basic)) return; data.mld_id = ieee80211_mle_get_mld_id((const void *)elems->ml_basic); /* * So in order to do this, iterate the RNR element(s) and see * what channel is reported now. */ cfg80211_iter_rnr(elems->ie_start, elems->total_len, ieee80211_sta_csa_rnr_iter, &data); if (!data.chan) { link_info(link, "couldn't find (valid) channel in RNR for CSA, disconnect\n"); wiphy_work_queue(sdata->local->hw.wiphy, &ifmgd->csa_connection_drop_work); return; } /* * If it doesn't match the CSA, then assume it aborted. This * may erroneously detect that it was _not_ aborted when it * was in fact aborted, but only changed the bandwidth or the * puncturing configuration, but we don't have enough data to * detect that. */ if (data.chan != link->csa.chanreq.oper.chan) ieee80211_sta_abort_chanswitch(link); } enum ieee80211_csa_source { IEEE80211_CSA_SOURCE_BEACON, IEEE80211_CSA_SOURCE_OTHER_LINK, IEEE80211_CSA_SOURCE_PROT_ACTION, IEEE80211_CSA_SOURCE_UNPROT_ACTION, }; static void ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, u64 timestamp, u32 device_timestamp, struct ieee802_11_elems *full_elems, struct ieee802_11_elems *csa_elems, enum ieee80211_csa_source source) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_chanctx *chanctx = NULL; struct ieee80211_chanctx_conf *conf; struct ieee80211_csa_ie csa_ie = {}; struct ieee80211_channel_switch ch_switch = { .link_id = link->link_id, .timestamp = timestamp, .device_timestamp = device_timestamp, }; unsigned long now; int res; lockdep_assert_wiphy(local->hw.wiphy); if (csa_elems) { struct cfg80211_bss *cbss = link->conf->bss; enum nl80211_band current_band; struct ieee80211_bss *bss; if (WARN_ON(!cbss)) return; current_band = cbss->channel->band; bss = (void *)cbss->priv; res = ieee80211_parse_ch_switch_ie(sdata, csa_elems, current_band, bss->vht_cap_info, &link->u.mgd.conn, link->u.mgd.bssid, source == IEEE80211_CSA_SOURCE_UNPROT_ACTION, &csa_ie); if (res == 0) { ch_switch.block_tx = csa_ie.mode; ch_switch.chandef = csa_ie.chanreq.oper; ch_switch.count = csa_ie.count; ch_switch.delay = csa_ie.max_switch_time; } link->u.mgd.csa.tpe = csa_elems->csa_tpe; } else { /* * If there was no per-STA profile for this link, we * get called with csa_elems == NULL. This of course means * there are no CSA elements, so set res=1 indicating * no more CSA. */ res = 1; } if (res < 0) { /* ignore this case, not a protected frame */ if (source == IEEE80211_CSA_SOURCE_UNPROT_ACTION) return; goto drop_connection; } if (link->conf->csa_active) { switch (source) { case IEEE80211_CSA_SOURCE_PROT_ACTION: case IEEE80211_CSA_SOURCE_UNPROT_ACTION: /* already processing - disregard action frames */ return; case IEEE80211_CSA_SOURCE_BEACON: if (link->u.mgd.csa.waiting_bcn) { ieee80211_chswitch_post_beacon(link); /* * If the CSA is still present after the switch * we need to consider it as a new CSA (possibly * to self). This happens by not returning here * so we'll get to the check below. */ } else if (res) { ieee80211_sta_abort_chanswitch(link); return; } else { drv_channel_switch_rx_beacon(sdata, &ch_switch); return; } break; case IEEE80211_CSA_SOURCE_OTHER_LINK: /* active link: we want to see the beacon to continue */ if (ieee80211_vif_link_active(&sdata->vif, link->link_id)) return; /* switch work ran, so just complete the process */ if (link->u.mgd.csa.waiting_bcn) { ieee80211_chswitch_post_beacon(link); /* * If the CSA is still present after the switch * we need to consider it as a new CSA (possibly * to self). This happens by not returning here * so we'll get to the check below. */ break; } /* link still has CSA but we already know, do nothing */ if (!res) return; /* check in the RNR if the CSA aborted */ ieee80211_sta_other_link_csa_disappeared(link, full_elems); return; } } /* no active CSA nor a new one */ if (res) { /* * However, we may have stopped queues when receiving a public * action frame that couldn't be protected, if it had the quiet * bit set. This is a trade-off, we want to be quiet as soon as * possible, but also don't trust the public action frame much, * as it can't be protected. */ if (unlikely(link->u.mgd.csa.blocked_tx)) { link->u.mgd.csa.blocked_tx = false; ieee80211_vif_unblock_queues_csa(sdata); } return; } /* * We don't really trust public action frames, but block queues (go to * quiet mode) for them anyway, we should get a beacon soon to either * know what the CSA really is, or figure out the public action frame * was actually an attack. */ if (source == IEEE80211_CSA_SOURCE_UNPROT_ACTION) { if (csa_ie.mode) { link->u.mgd.csa.blocked_tx = true; ieee80211_vif_block_queues_csa(sdata); } return; } if (link->conf->chanreq.oper.chan->band != csa_ie.chanreq.oper.chan->band) { link_info(link, "AP %pM switches to different band (%d MHz, width:%d, CF1/2: %d/%d MHz), disconnecting\n", link->u.mgd.bssid, csa_ie.chanreq.oper.chan->center_freq, csa_ie.chanreq.oper.width, csa_ie.chanreq.oper.center_freq1, csa_ie.chanreq.oper.center_freq2); goto drop_connection; } if (!cfg80211_chandef_usable(local->hw.wiphy, &csa_ie.chanreq.oper, IEEE80211_CHAN_DISABLED)) { link_info(link, "AP %pM switches to unsupported channel (%d.%03d MHz, width:%d, CF1/2: %d.%03d/%d MHz), disconnecting\n", link->u.mgd.bssid, csa_ie.chanreq.oper.chan->center_freq, csa_ie.chanreq.oper.chan->freq_offset, csa_ie.chanreq.oper.width, csa_ie.chanreq.oper.center_freq1, csa_ie.chanreq.oper.freq1_offset, csa_ie.chanreq.oper.center_freq2); goto drop_connection; } if (cfg80211_chandef_identical(&csa_ie.chanreq.oper, &link->conf->chanreq.oper) && (!csa_ie.mode || source != IEEE80211_CSA_SOURCE_BEACON)) { if (link->u.mgd.csa.ignored_same_chan) return; link_info(link, "AP %pM tries to chanswitch to same channel, ignore\n", link->u.mgd.bssid); link->u.mgd.csa.ignored_same_chan = true; return; } /* * Drop all TDLS peers on the affected link - either we disconnect or * move to a different channel from this point on. There's no telling * what our peer will do. * The TDLS WIDER_BW scenario is also problematic, as peers might now * have an incompatible wider chandef. */ ieee80211_teardown_tdls_peers(link); conf = rcu_dereference_protected(link->conf->chanctx_conf, lockdep_is_held(&local->hw.wiphy->mtx)); if (ieee80211_vif_link_active(&sdata->vif, link->link_id) && !conf) { link_info(link, "no channel context assigned to vif?, disconnecting\n"); goto drop_connection; } if (conf) chanctx = container_of(conf, struct ieee80211_chanctx, conf); if (!ieee80211_hw_check(&local->hw, CHANCTX_STA_CSA)) { link_info(link, "driver doesn't support chan-switch with channel contexts\n"); goto drop_connection; } if (drv_pre_channel_switch(sdata, &ch_switch)) { link_info(link, "preparing for channel switch failed, disconnecting\n"); goto drop_connection; } link->u.mgd.csa.ap_chandef = csa_ie.chanreq.ap; link->csa.chanreq.oper = csa_ie.chanreq.oper; ieee80211_set_chanreq_ap(sdata, &link->csa.chanreq, &link->u.mgd.conn, &csa_ie.chanreq.ap); if (chanctx) { res = ieee80211_link_reserve_chanctx(link, &link->csa.chanreq, chanctx->mode, false); if (res) { link_info(link, "failed to reserve channel context for channel switch, disconnecting (err=%d)\n", res); goto drop_connection; } } link->conf->csa_active = true; link->u.mgd.csa.ignored_same_chan = false; link->u.mgd.beacon_crc_valid = false; link->u.mgd.csa.blocked_tx = csa_ie.mode; if (csa_ie.mode) ieee80211_vif_block_queues_csa(sdata); cfg80211_ch_switch_started_notify(sdata->dev, &csa_ie.chanreq.oper, link->link_id, csa_ie.count, csa_ie.mode); /* we may have to handle timeout for deactivated link in software */ now = jiffies; link->u.mgd.csa.time = now + TU_TO_JIFFIES((max_t(int, csa_ie.count, 1) - 1) * link->conf->beacon_int); if (ieee80211_vif_link_active(&sdata->vif, link->link_id) && local->ops->channel_switch) { /* * Use driver's channel switch callback, the driver will * later call ieee80211_chswitch_done(). It may deactivate * the link as well, we handle that elsewhere and queue * the csa.switch_work for the calculated time then. */ drv_channel_switch(local, sdata, &ch_switch); return; } /* channel switch handled in software */ wiphy_delayed_work_queue(local->hw.wiphy, &link->u.mgd.csa.switch_work, link->u.mgd.csa.time - now); return; drop_connection: /* * This is just so that the disconnect flow will know that * we were trying to switch channel and failed. In case the * mode is 1 (we are not allowed to Tx), we will know not to * send a deauthentication frame. Those two fields will be * reset when the disconnection worker runs. */ link->conf->csa_active = true; link->u.mgd.csa.blocked_tx = csa_ie.mode; wiphy_work_queue(sdata->local->hw.wiphy, &ifmgd->csa_connection_drop_work); } struct sta_bss_param_ch_cnt_data { struct ieee80211_sub_if_data *sdata; u8 reporting_link_id; u8 mld_id; }; static enum cfg80211_rnr_iter_ret ieee80211_sta_bss_param_ch_cnt_iter(void *_data, u8 type, const struct ieee80211_neighbor_ap_info *info, const u8 *tbtt_info, u8 tbtt_info_len) { struct sta_bss_param_ch_cnt_data *data = _data; struct ieee80211_sub_if_data *sdata = data->sdata; const struct ieee80211_tbtt_info_ge_11 *ti; u8 bss_param_ch_cnt; int link_id; if (type != IEEE80211_TBTT_INFO_TYPE_TBTT) return RNR_ITER_CONTINUE; if (tbtt_info_len < sizeof(*ti)) return RNR_ITER_CONTINUE; ti = (const void *)tbtt_info; if (ti->mld_params.mld_id != data->mld_id) return RNR_ITER_CONTINUE; link_id = le16_get_bits(ti->mld_params.params, IEEE80211_RNR_MLD_PARAMS_LINK_ID); bss_param_ch_cnt = le16_get_bits(ti->mld_params.params, IEEE80211_RNR_MLD_PARAMS_BSS_CHANGE_COUNT); if (bss_param_ch_cnt != 255 && link_id < ARRAY_SIZE(sdata->link)) { struct ieee80211_link_data *link = sdata_dereference(sdata->link[link_id], sdata); if (link && link->conf->bss_param_ch_cnt != bss_param_ch_cnt) { link->conf->bss_param_ch_cnt = bss_param_ch_cnt; link->conf->bss_param_ch_cnt_link_id = data->reporting_link_id; } } return RNR_ITER_CONTINUE; } static void ieee80211_mgd_update_bss_param_ch_cnt(struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *bss_conf, struct ieee802_11_elems *elems) { struct sta_bss_param_ch_cnt_data data = { .reporting_link_id = bss_conf->link_id, .sdata = sdata, }; int bss_param_ch_cnt; if (!elems->ml_basic) return; data.mld_id = ieee80211_mle_get_mld_id((const void *)elems->ml_basic); cfg80211_iter_rnr(elems->ie_start, elems->total_len, ieee80211_sta_bss_param_ch_cnt_iter, &data); bss_param_ch_cnt = ieee80211_mle_get_bss_param_ch_cnt((const void *)elems->ml_basic); /* * Update bss_param_ch_cnt_link_id even if bss_param_ch_cnt * didn't change to indicate that we got a beacon on our own * link. */ if (bss_param_ch_cnt >= 0 && bss_param_ch_cnt != 255) { bss_conf->bss_param_ch_cnt = bss_param_ch_cnt; bss_conf->bss_param_ch_cnt_link_id = bss_conf->link_id; } } static bool ieee80211_find_80211h_pwr_constr(struct ieee80211_channel *channel, const u8 *country_ie, u8 country_ie_len, const u8 *pwr_constr_elem, int *chan_pwr, int *pwr_reduction) { struct ieee80211_country_ie_triplet *triplet; int chan = ieee80211_frequency_to_channel(channel->center_freq); int i, chan_increment; bool have_chan_pwr = false; /* Invalid IE */ if (country_ie_len % 2 || country_ie_len < IEEE80211_COUNTRY_IE_MIN_LEN) return false; triplet = (void *)(country_ie + 3); country_ie_len -= 3; switch (channel->band) { default: WARN_ON_ONCE(1); fallthrough; case NL80211_BAND_2GHZ: case NL80211_BAND_60GHZ: case NL80211_BAND_LC: chan_increment = 1; break; case NL80211_BAND_5GHZ: chan_increment = 4; break; case NL80211_BAND_6GHZ: /* * In the 6 GHz band, the "maximum transmit power level" * field in the triplets is reserved, and thus will be * zero and we shouldn't use it to control TX power. * The actual TX power will be given in the transmit * power envelope element instead. */ return false; } /* find channel */ while (country_ie_len >= 3) { u8 first_channel = triplet->chans.first_channel; if (first_channel >= IEEE80211_COUNTRY_EXTENSION_ID) goto next; for (i = 0; i < triplet->chans.num_channels; i++) { if (first_channel + i * chan_increment == chan) { have_chan_pwr = true; *chan_pwr = triplet->chans.max_power; break; } } if (have_chan_pwr) break; next: triplet++; country_ie_len -= 3; } if (have_chan_pwr && pwr_constr_elem) *pwr_reduction = *pwr_constr_elem; else *pwr_reduction = 0; return have_chan_pwr; } static void ieee80211_find_cisco_dtpc(struct ieee80211_channel *channel, const u8 *cisco_dtpc_ie, int *pwr_level) { /* From practical testing, the first data byte of the DTPC element * seems to contain the requested dBm level, and the CLI on Cisco * APs clearly state the range is -127 to 127 dBm, which indicates * a signed byte, although it seemingly never actually goes negative. * The other byte seems to always be zero. */ *pwr_level = (__s8)cisco_dtpc_ie[4]; } static u64 ieee80211_handle_pwr_constr(struct ieee80211_link_data *link, struct ieee80211_channel *channel, struct ieee80211_mgmt *mgmt, const u8 *country_ie, u8 country_ie_len, const u8 *pwr_constr_ie, const u8 *cisco_dtpc_ie) { struct ieee80211_sub_if_data *sdata = link->sdata; bool has_80211h_pwr = false, has_cisco_pwr = false; int chan_pwr = 0, pwr_reduction_80211h = 0; int pwr_level_cisco, pwr_level_80211h; int new_ap_level; __le16 capab = mgmt->u.probe_resp.capab_info; if (ieee80211_is_s1g_beacon(mgmt->frame_control)) return 0; /* TODO */ if (country_ie && (capab & cpu_to_le16(WLAN_CAPABILITY_SPECTRUM_MGMT) || capab & cpu_to_le16(WLAN_CAPABILITY_RADIO_MEASURE))) { has_80211h_pwr = ieee80211_find_80211h_pwr_constr( channel, country_ie, country_ie_len, pwr_constr_ie, &chan_pwr, &pwr_reduction_80211h); pwr_level_80211h = max_t(int, 0, chan_pwr - pwr_reduction_80211h); } if (cisco_dtpc_ie) { ieee80211_find_cisco_dtpc( channel, cisco_dtpc_ie, &pwr_level_cisco); has_cisco_pwr = true; } if (!has_80211h_pwr && !has_cisco_pwr) return 0; /* If we have both 802.11h and Cisco DTPC, apply both limits * by picking the smallest of the two power levels advertised. */ if (has_80211h_pwr && (!has_cisco_pwr || pwr_level_80211h <= pwr_level_cisco)) { new_ap_level = pwr_level_80211h; if (link->ap_power_level == new_ap_level) return 0; sdata_dbg(sdata, "Limiting TX power to %d (%d - %d) dBm as advertised by %pM\n", pwr_level_80211h, chan_pwr, pwr_reduction_80211h, link->u.mgd.bssid); } else { /* has_cisco_pwr is always true here. */ new_ap_level = pwr_level_cisco; if (link->ap_power_level == new_ap_level) return 0; sdata_dbg(sdata, "Limiting TX power to %d dBm as advertised by %pM\n", pwr_level_cisco, link->u.mgd.bssid); } link->ap_power_level = new_ap_level; if (__ieee80211_recalc_txpower(link)) return BSS_CHANGED_TXPOWER; return 0; } /* powersave */ static void ieee80211_enable_ps(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { struct ieee80211_conf *conf = &local->hw.conf; /* * If we are scanning right now then the parameters will * take effect when scan finishes. */ if (local->scanning) return; if (conf->dynamic_ps_timeout > 0 && !ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) { mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies(conf->dynamic_ps_timeout)); } else { if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) ieee80211_send_nullfunc(local, sdata, true); if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) return; conf->flags |= IEEE80211_CONF_PS; ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } } static void ieee80211_change_ps(struct ieee80211_local *local) { struct ieee80211_conf *conf = &local->hw.conf; if (local->ps_sdata) { ieee80211_enable_ps(local, local->ps_sdata); } else if (conf->flags & IEEE80211_CONF_PS) { conf->flags &= ~IEEE80211_CONF_PS; ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); timer_delete_sync(&local->dynamic_ps_timer); wiphy_work_cancel(local->hw.wiphy, &local->dynamic_ps_enable_work); } } static bool ieee80211_powersave_allowed(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *mgd = &sdata->u.mgd; struct sta_info *sta = NULL; bool authorized = false; if (!mgd->powersave) return false; if (mgd->broken_ap) return false; if (!mgd->associated) return false; if (mgd->flags & IEEE80211_STA_CONNECTION_POLL) return false; if (!(local->hw.wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO) && !sdata->deflink.u.mgd.have_beacon) return false; rcu_read_lock(); sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr); if (sta) authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED); rcu_read_unlock(); return authorized; } /* need to hold RTNL or interface lock */ void ieee80211_recalc_ps(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata, *found = NULL; int count = 0; int timeout; if (!ieee80211_hw_check(&local->hw, SUPPORTS_PS) || ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) { local->ps_sdata = NULL; return; } list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type == NL80211_IFTYPE_AP) { /* If an AP vif is found, then disable PS * by setting the count to zero thereby setting * ps_sdata to NULL. */ count = 0; break; } if (sdata->vif.type != NL80211_IFTYPE_STATION) continue; found = sdata; count++; } if (count == 1 && ieee80211_powersave_allowed(found)) { u8 dtimper = found->deflink.u.mgd.dtim_period; timeout = local->dynamic_ps_forced_timeout; if (timeout < 0) timeout = 100; local->hw.conf.dynamic_ps_timeout = timeout; /* If the TIM IE is invalid, pretend the value is 1 */ if (!dtimper) dtimper = 1; local->hw.conf.ps_dtim_period = dtimper; local->ps_sdata = found; } else { local->ps_sdata = NULL; } ieee80211_change_ps(local); } void ieee80211_recalc_ps_vif(struct ieee80211_sub_if_data *sdata) { bool ps_allowed = ieee80211_powersave_allowed(sdata); if (sdata->vif.cfg.ps != ps_allowed) { sdata->vif.cfg.ps = ps_allowed; ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_PS); } } void ieee80211_dynamic_ps_disable_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, dynamic_ps_disable_work); if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } ieee80211_wake_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_PS, false); } void ieee80211_dynamic_ps_enable_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, dynamic_ps_enable_work); struct ieee80211_sub_if_data *sdata = local->ps_sdata; struct ieee80211_if_managed *ifmgd; unsigned long flags; int q; /* can only happen when PS was just disabled anyway */ if (!sdata) return; ifmgd = &sdata->u.mgd; if (local->hw.conf.flags & IEEE80211_CONF_PS) return; if (local->hw.conf.dynamic_ps_timeout > 0) { /* don't enter PS if TX frames are pending */ if (drv_tx_frames_pending(local)) { mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies( local->hw.conf.dynamic_ps_timeout)); return; } /* * transmission can be stopped by others which leads to * dynamic_ps_timer expiry. Postpone the ps timer if it * is not the actual idle state. */ spin_lock_irqsave(&local->queue_stop_reason_lock, flags); for (q = 0; q < local->hw.queues; q++) { if (local->queue_stop_reasons[q]) { spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies( local->hw.conf.dynamic_ps_timeout)); return; } } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && !(ifmgd->flags & IEEE80211_STA_NULLFUNC_ACKED)) { if (drv_tx_frames_pending(local)) { mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies( local->hw.conf.dynamic_ps_timeout)); } else { ieee80211_send_nullfunc(local, sdata, true); /* Flush to get the tx status of nullfunc frame */ ieee80211_flush_queues(local, sdata, false); } } if (!(ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS) && ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) || (ifmgd->flags & IEEE80211_STA_NULLFUNC_ACKED)) { ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED; local->hw.conf.flags |= IEEE80211_CONF_PS; ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } } void ieee80211_dynamic_ps_timer(struct timer_list *t) { struct ieee80211_local *local = timer_container_of(local, t, dynamic_ps_timer); wiphy_work_queue(local->hw.wiphy, &local->dynamic_ps_enable_work); } void ieee80211_dfs_cac_timer_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_link_data *link = container_of(work, struct ieee80211_link_data, dfs_cac_timer_work.work); struct cfg80211_chan_def chandef = link->conf->chanreq.oper; struct ieee80211_sub_if_data *sdata = link->sdata; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (sdata->wdev.links[link->link_id].cac_started) { ieee80211_link_release_channel(link); cfg80211_cac_event(sdata->dev, &chandef, NL80211_RADAR_CAC_FINISHED, GFP_KERNEL, link->link_id); } } static bool __ieee80211_sta_handle_tspec_ac_params(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; bool ret = false; int ac; if (local->hw.queues < IEEE80211_NUM_ACS) return false; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { struct ieee80211_sta_tx_tspec *tx_tspec = &ifmgd->tx_tspec[ac]; int non_acm_ac; unsigned long now = jiffies; if (tx_tspec->action == TX_TSPEC_ACTION_NONE && tx_tspec->admitted_time && time_after(now, tx_tspec->time_slice_start + HZ)) { tx_tspec->consumed_tx_time = 0; tx_tspec->time_slice_start = now; if (tx_tspec->downgraded) tx_tspec->action = TX_TSPEC_ACTION_STOP_DOWNGRADE; } switch (tx_tspec->action) { case TX_TSPEC_ACTION_STOP_DOWNGRADE: /* take the original parameters */ if (drv_conf_tx(local, &sdata->deflink, ac, &sdata->deflink.tx_conf[ac])) link_err(&sdata->deflink, "failed to set TX queue parameters for queue %d\n", ac); tx_tspec->action = TX_TSPEC_ACTION_NONE; tx_tspec->downgraded = false; ret = true; break; case TX_TSPEC_ACTION_DOWNGRADE: if (time_after(now, tx_tspec->time_slice_start + HZ)) { tx_tspec->action = TX_TSPEC_ACTION_NONE; ret = true; break; } /* downgrade next lower non-ACM AC */ for (non_acm_ac = ac + 1; non_acm_ac < IEEE80211_NUM_ACS; non_acm_ac++) if (!(sdata->wmm_acm & BIT(7 - 2 * non_acm_ac))) break; /* Usually the loop will result in using BK even if it * requires admission control, but such a configuration * makes no sense and we have to transmit somehow - the * AC selection does the same thing. * If we started out trying to downgrade from BK, then * the extra condition here might be needed. */ if (non_acm_ac >= IEEE80211_NUM_ACS) non_acm_ac = IEEE80211_AC_BK; if (drv_conf_tx(local, &sdata->deflink, ac, &sdata->deflink.tx_conf[non_acm_ac])) link_err(&sdata->deflink, "failed to set TX queue parameters for queue %d\n", ac); tx_tspec->action = TX_TSPEC_ACTION_NONE; ret = true; wiphy_delayed_work_queue(local->hw.wiphy, &ifmgd->tx_tspec_wk, tx_tspec->time_slice_start + HZ - now + 1); break; case TX_TSPEC_ACTION_NONE: /* nothing now */ break; } } return ret; } void ieee80211_sta_handle_tspec_ac_params(struct ieee80211_sub_if_data *sdata) { if (__ieee80211_sta_handle_tspec_ac_params(sdata)) ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_QOS); } static void ieee80211_sta_handle_tspec_ac_params_wk(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata; sdata = container_of(work, struct ieee80211_sub_if_data, u.mgd.tx_tspec_wk.work); ieee80211_sta_handle_tspec_ac_params(sdata); } void ieee80211_mgd_set_link_qos_params(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_tx_queue_params *params = link->tx_conf; u8 ac; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { mlme_dbg(sdata, "WMM AC=%d acm=%d aifs=%d cWmin=%d cWmax=%d txop=%d uapsd=%d, downgraded=%d\n", ac, params[ac].acm, params[ac].aifs, params[ac].cw_min, params[ac].cw_max, params[ac].txop, params[ac].uapsd, ifmgd->tx_tspec[ac].downgraded); if (!ifmgd->tx_tspec[ac].downgraded && drv_conf_tx(local, link, ac, &params[ac])) link_err(link, "failed to set TX queue parameters for AC %d\n", ac); } } /* MLME */ static bool _ieee80211_sta_wmm_params(struct ieee80211_local *local, struct ieee80211_link_data *link, const u8 *wmm_param, size_t wmm_param_len, const struct ieee80211_mu_edca_param_set *mu_edca) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_tx_queue_params params[IEEE80211_NUM_ACS]; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; size_t left; int count, mu_edca_count, ac; const u8 *pos; u8 uapsd_queues = 0; if (!local->ops->conf_tx) return false; if (local->hw.queues < IEEE80211_NUM_ACS) return false; if (!wmm_param) return false; if (wmm_param_len < 8 || wmm_param[5] /* version */ != 1) return false; if (ifmgd->flags & IEEE80211_STA_UAPSD_ENABLED) uapsd_queues = ifmgd->uapsd_queues; count = wmm_param[6] & 0x0f; /* -1 is the initial value of ifmgd->mu_edca_last_param_set. * if mu_edca was preset before and now it disappeared tell * the driver about it. */ mu_edca_count = mu_edca ? mu_edca->mu_qos_info & 0x0f : -1; if (count == link->u.mgd.wmm_last_param_set && mu_edca_count == link->u.mgd.mu_edca_last_param_set) return false; link->u.mgd.wmm_last_param_set = count; link->u.mgd.mu_edca_last_param_set = mu_edca_count; pos = wmm_param + 8; left = wmm_param_len - 8; memset(&params, 0, sizeof(params)); sdata->wmm_acm = 0; for (; left >= 4; left -= 4, pos += 4) { int aci = (pos[0] >> 5) & 0x03; int acm = (pos[0] >> 4) & 0x01; bool uapsd = false; switch (aci) { case 1: /* AC_BK */ ac = IEEE80211_AC_BK; if (acm) sdata->wmm_acm |= BIT(1) | BIT(2); /* BK/- */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BK) uapsd = true; params[ac].mu_edca = !!mu_edca; if (mu_edca) params[ac].mu_edca_param_rec = mu_edca->ac_bk; break; case 2: /* AC_VI */ ac = IEEE80211_AC_VI; if (acm) sdata->wmm_acm |= BIT(4) | BIT(5); /* CL/VI */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VI) uapsd = true; params[ac].mu_edca = !!mu_edca; if (mu_edca) params[ac].mu_edca_param_rec = mu_edca->ac_vi; break; case 3: /* AC_VO */ ac = IEEE80211_AC_VO; if (acm) sdata->wmm_acm |= BIT(6) | BIT(7); /* VO/NC */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VO) uapsd = true; params[ac].mu_edca = !!mu_edca; if (mu_edca) params[ac].mu_edca_param_rec = mu_edca->ac_vo; break; case 0: /* AC_BE */ default: ac = IEEE80211_AC_BE; if (acm) sdata->wmm_acm |= BIT(0) | BIT(3); /* BE/EE */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BE) uapsd = true; params[ac].mu_edca = !!mu_edca; if (mu_edca) params[ac].mu_edca_param_rec = mu_edca->ac_be; break; } params[ac].aifs = pos[0] & 0x0f; if (params[ac].aifs < 2) { link_info(link, "AP has invalid WMM params (AIFSN=%d for ACI %d), will use 2\n", params[ac].aifs, aci); params[ac].aifs = 2; } params[ac].cw_max = ecw2cw((pos[1] & 0xf0) >> 4); params[ac].cw_min = ecw2cw(pos[1] & 0x0f); params[ac].txop = get_unaligned_le16(pos + 2); params[ac].acm = acm; params[ac].uapsd = uapsd; if (params[ac].cw_min == 0 || params[ac].cw_min > params[ac].cw_max) { link_info(link, "AP has invalid WMM params (CWmin/max=%d/%d for ACI %d), using defaults\n", params[ac].cw_min, params[ac].cw_max, aci); return false; } ieee80211_regulatory_limit_wmm_params(sdata, &params[ac], ac); } /* WMM specification requires all 4 ACIs. */ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { if (params[ac].cw_min == 0) { link_info(link, "AP has invalid WMM params (missing AC %d), using defaults\n", ac); return false; } } for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) link->tx_conf[ac] = params[ac]; return true; } static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, struct ieee80211_link_data *link, const u8 *wmm_param, size_t wmm_param_len, const struct ieee80211_mu_edca_param_set *mu_edca) { if (!_ieee80211_sta_wmm_params(local, link, wmm_param, wmm_param_len, mu_edca)) return false; ieee80211_mgd_set_link_qos_params(link); /* enable WMM or activate new settings */ link->conf->qos = true; return true; } static void __ieee80211_stop_poll(struct ieee80211_sub_if_data *sdata) { lockdep_assert_wiphy(sdata->local->hw.wiphy); sdata->u.mgd.flags &= ~IEEE80211_STA_CONNECTION_POLL; ieee80211_run_deferred_scan(sdata->local); } static void ieee80211_stop_poll(struct ieee80211_sub_if_data *sdata) { lockdep_assert_wiphy(sdata->local->hw.wiphy); __ieee80211_stop_poll(sdata); } static u64 ieee80211_handle_bss_capability(struct ieee80211_link_data *link, u16 capab, bool erp_valid, u8 erp) { struct ieee80211_bss_conf *bss_conf = link->conf; struct ieee80211_supported_band *sband; u64 changed = 0; bool use_protection; bool use_short_preamble; bool use_short_slot; sband = ieee80211_get_link_sband(link); if (!sband) return changed; if (erp_valid) { use_protection = (erp & WLAN_ERP_USE_PROTECTION) != 0; use_short_preamble = (erp & WLAN_ERP_BARKER_PREAMBLE) == 0; } else { use_protection = false; use_short_preamble = !!(capab & WLAN_CAPABILITY_SHORT_PREAMBLE); } use_short_slot = !!(capab & WLAN_CAPABILITY_SHORT_SLOT_TIME); if (sband->band == NL80211_BAND_5GHZ || sband->band == NL80211_BAND_6GHZ) use_short_slot = true; if (use_protection != bss_conf->use_cts_prot) { bss_conf->use_cts_prot = use_protection; changed |= BSS_CHANGED_ERP_CTS_PROT; } if (use_short_preamble != bss_conf->use_short_preamble) { bss_conf->use_short_preamble = use_short_preamble; changed |= BSS_CHANGED_ERP_PREAMBLE; } if (use_short_slot != bss_conf->use_short_slot) { bss_conf->use_short_slot = use_short_slot; changed |= BSS_CHANGED_ERP_SLOT; } return changed; } static u64 ieee80211_link_set_associated(struct ieee80211_link_data *link, struct cfg80211_bss *cbss) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_bss_conf *bss_conf = link->conf; struct ieee80211_bss *bss = (void *)cbss->priv; u64 changed = BSS_CHANGED_QOS; /* not really used in MLO */ sdata->u.mgd.beacon_timeout = usecs_to_jiffies(ieee80211_tu_to_usec(beacon_loss_count * bss_conf->beacon_int)); changed |= ieee80211_handle_bss_capability(link, bss_conf->assoc_capability, bss->has_erp_value, bss->erp_value); ieee80211_check_rate_mask(link); link->conf->bss = cbss; memcpy(link->u.mgd.bssid, cbss->bssid, ETH_ALEN); if (sdata->vif.p2p || sdata->vif.driver_flags & IEEE80211_VIF_GET_NOA_UPDATE) { const struct cfg80211_bss_ies *ies; rcu_read_lock(); ies = rcu_dereference(cbss->ies); if (ies) { int ret; ret = cfg80211_get_p2p_attr( ies->data, ies->len, IEEE80211_P2P_ATTR_ABSENCE_NOTICE, (u8 *) &bss_conf->p2p_noa_attr, sizeof(bss_conf->p2p_noa_attr)); if (ret >= 2) { link->u.mgd.p2p_noa_index = bss_conf->p2p_noa_attr.index; changed |= BSS_CHANGED_P2P_PS; } } rcu_read_unlock(); } if (link->u.mgd.have_beacon) { bss_conf->beacon_rate = bss->beacon_rate; changed |= BSS_CHANGED_BEACON_INFO; } else { bss_conf->beacon_rate = NULL; } /* Tell the driver to monitor connection quality (if supported) */ if (sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI && bss_conf->cqm_rssi_thold) changed |= BSS_CHANGED_CQM; return changed; } static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgd_assoc_data *assoc_data, u64 changed[IEEE80211_MLD_MAX_NUM_LINKS]) { struct ieee80211_local *local = sdata->local; struct ieee80211_vif_cfg *vif_cfg = &sdata->vif.cfg; u64 vif_changed = BSS_CHANGED_ASSOC; unsigned int link_id; lockdep_assert_wiphy(local->hw.wiphy); sdata->u.mgd.associated = true; for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; struct ieee80211_link_data *link; if (!cbss || assoc_data->link[link_id].status != WLAN_STATUS_SUCCESS) continue; if (ieee80211_vif_is_mld(&sdata->vif) && !(ieee80211_vif_usable_links(&sdata->vif) & BIT(link_id))) continue; link = sdata_dereference(sdata->link[link_id], sdata); if (WARN_ON(!link)) return; changed[link_id] |= ieee80211_link_set_associated(link, cbss); } /* just to be sure */ ieee80211_stop_poll(sdata); ieee80211_led_assoc(local, 1); vif_cfg->assoc = 1; /* Enable ARP filtering */ if (vif_cfg->arp_addr_cnt) vif_changed |= BSS_CHANGED_ARP_FILTER; if (ieee80211_vif_is_mld(&sdata->vif)) { for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct ieee80211_link_data *link; struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; if (!cbss || !(BIT(link_id) & ieee80211_vif_usable_links(&sdata->vif)) || assoc_data->link[link_id].status != WLAN_STATUS_SUCCESS) continue; link = sdata_dereference(sdata->link[link_id], sdata); if (WARN_ON(!link)) return; ieee80211_link_info_change_notify(sdata, link, changed[link_id]); ieee80211_recalc_smps(sdata, link); } ieee80211_vif_cfg_change_notify(sdata, vif_changed); } else { ieee80211_bss_info_change_notify(sdata, vif_changed | changed[0]); } ieee80211_recalc_ps(local); /* leave this here to not change ordering in non-MLO cases */ if (!ieee80211_vif_is_mld(&sdata->vif)) ieee80211_recalc_smps(sdata, &sdata->deflink); ieee80211_recalc_ps_vif(sdata); netif_carrier_on(sdata->dev); } static void ieee80211_ml_reconf_reset(struct ieee80211_sub_if_data *sdata) { struct ieee80211_mgd_assoc_data *add_links_data = sdata->u.mgd.reconf.add_links_data; if (!ieee80211_vif_is_mld(&sdata->vif) || !(sdata->u.mgd.reconf.added_links | sdata->u.mgd.reconf.removed_links)) return; wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.reconf.wk); sdata->u.mgd.reconf.added_links = 0; sdata->u.mgd.reconf.removed_links = 0; sdata->u.mgd.reconf.dialog_token = 0; if (add_links_data) { struct cfg80211_mlo_reconf_done_data done_data = {}; u8 link_id; for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) done_data.links[link_id].bss = add_links_data->link[link_id].bss; cfg80211_mlo_reconf_add_done(sdata->dev, &done_data); kfree(sdata->u.mgd.reconf.add_links_data); sdata->u.mgd.reconf.add_links_data = NULL; } } static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, u16 stype, u16 reason, bool tx, u8 *frame_buf) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; struct sta_info *ap_sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr); unsigned int link_id; u64 changed = 0; struct ieee80211_prep_tx_info info = { .subtype = stype, .was_assoc = true, .link_id = ffs(sdata->vif.active_links) - 1, }; lockdep_assert_wiphy(local->hw.wiphy); if (frame_buf) memset(frame_buf, 0, IEEE80211_DEAUTH_FRAME_LEN); if (WARN_ON(!ap_sta)) return; if (WARN_ON_ONCE(tx && !frame_buf)) return; if (WARN_ON(!ifmgd->associated)) return; ieee80211_stop_poll(sdata); ifmgd->associated = false; if (tx) { bool tx_link_found = false; for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link; if (!ieee80211_vif_link_active(&sdata->vif, link_id)) continue; link = sdata_dereference(sdata->link[link_id], sdata); if (WARN_ON_ONCE(!link)) continue; if (link->u.mgd.csa.blocked_tx) continue; tx_link_found = true; break; } tx = tx_link_found; } /* other links will be destroyed */ sdata->deflink.conf->bss = NULL; sdata->deflink.conf->epcs_support = false; sdata->deflink.smps_mode = IEEE80211_SMPS_OFF; netif_carrier_off(sdata->dev); /* * if we want to get out of ps before disassoc (why?) we have * to do it before sending disassoc, as otherwise the null-packet * won't be valid. */ if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } local->ps_sdata = NULL; /* disable per-vif ps */ ieee80211_recalc_ps_vif(sdata); /* make sure ongoing transmission finishes */ synchronize_net(); /* * drop any frame before deauth/disassoc, this can be data or * management frame. Since we are disconnecting, we should not * insist sending these frames which can take time and delay * the disconnection and possible the roaming. */ ieee80211_flush_queues(local, sdata, true); if (tx) { drv_mgd_prepare_tx(sdata->local, sdata, &info); ieee80211_send_deauth_disassoc(sdata, sdata->vif.cfg.ap_addr, sdata->vif.cfg.ap_addr, stype, reason, true, frame_buf); /* flush out frame - make sure the deauth was actually sent */ ieee80211_flush_queues(local, sdata, false); drv_mgd_complete_tx(sdata->local, sdata, &info); } else if (frame_buf) { ieee80211_send_deauth_disassoc(sdata, sdata->vif.cfg.ap_addr, sdata->vif.cfg.ap_addr, stype, reason, false, frame_buf); } /* clear AP addr only after building the needed mgmt frames */ eth_zero_addr(sdata->deflink.u.mgd.bssid); eth_zero_addr(sdata->vif.cfg.ap_addr); sdata->vif.cfg.ssid_len = 0; /* Remove TDLS peers */ __sta_info_flush(sdata, false, -1, ap_sta); if (sdata->vif.driver_flags & IEEE80211_VIF_REMOVE_AP_AFTER_DISASSOC) { /* Only move the AP state */ sta_info_move_state(ap_sta, IEEE80211_STA_NONE); } else { /* Remove AP peer */ sta_info_flush(sdata, -1); } /* finally reset all BSS / config parameters */ if (!ieee80211_vif_is_mld(&sdata->vif)) changed |= ieee80211_reset_erp_info(sdata); ieee80211_led_assoc(local, 0); changed |= BSS_CHANGED_ASSOC; sdata->vif.cfg.assoc = false; sdata->deflink.u.mgd.p2p_noa_index = -1; memset(&sdata->vif.bss_conf.p2p_noa_attr, 0, sizeof(sdata->vif.bss_conf.p2p_noa_attr)); /* on the next assoc, re-program HT/VHT parameters */ memset(&ifmgd->ht_capa, 0, sizeof(ifmgd->ht_capa)); memset(&ifmgd->ht_capa_mask, 0, sizeof(ifmgd->ht_capa_mask)); memset(&ifmgd->vht_capa, 0, sizeof(ifmgd->vht_capa)); memset(&ifmgd->vht_capa_mask, 0, sizeof(ifmgd->vht_capa_mask)); /* * reset MU-MIMO ownership and group data in default link, * if used, other links are destroyed */ memset(sdata->vif.bss_conf.mu_group.membership, 0, sizeof(sdata->vif.bss_conf.mu_group.membership)); memset(sdata->vif.bss_conf.mu_group.position, 0, sizeof(sdata->vif.bss_conf.mu_group.position)); if (!ieee80211_vif_is_mld(&sdata->vif)) changed |= BSS_CHANGED_MU_GROUPS; sdata->vif.bss_conf.mu_mimo_owner = false; sdata->deflink.ap_power_level = IEEE80211_UNSET_POWER_LEVEL; timer_delete_sync(&local->dynamic_ps_timer); wiphy_work_cancel(local->hw.wiphy, &local->dynamic_ps_enable_work); /* Disable ARP filtering */ if (sdata->vif.cfg.arp_addr_cnt) changed |= BSS_CHANGED_ARP_FILTER; sdata->vif.bss_conf.qos = false; if (!ieee80211_vif_is_mld(&sdata->vif)) { changed |= BSS_CHANGED_QOS; /* The BSSID (not really interesting) and HT changed */ changed |= BSS_CHANGED_BSSID | BSS_CHANGED_HT; ieee80211_bss_info_change_notify(sdata, changed); } else { ieee80211_vif_cfg_change_notify(sdata, changed); } if (sdata->vif.driver_flags & IEEE80211_VIF_REMOVE_AP_AFTER_DISASSOC) { /* * After notifying the driver about the disassoc, * remove the ap sta. */ sta_info_flush(sdata, -1); } /* disassociated - set to defaults now */ ieee80211_set_wmm_default(&sdata->deflink, false, false); timer_delete_sync(&sdata->u.mgd.conn_mon_timer); timer_delete_sync(&sdata->u.mgd.bcn_mon_timer); timer_delete_sync(&sdata->u.mgd.timer); sdata->vif.bss_conf.dtim_period = 0; sdata->vif.bss_conf.beacon_rate = NULL; sdata->deflink.u.mgd.have_beacon = false; sdata->deflink.u.mgd.tracking_signal_avg = false; sdata->deflink.u.mgd.disable_wmm_tracking = false; ifmgd->flags = 0; for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link; link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; ieee80211_link_release_channel(link); } sdata->vif.bss_conf.csa_active = false; sdata->deflink.u.mgd.csa.blocked_tx = false; sdata->deflink.u.mgd.csa.waiting_bcn = false; sdata->deflink.u.mgd.csa.ignored_same_chan = false; ieee80211_vif_unblock_queues_csa(sdata); /* existing TX TSPEC sessions no longer exist */ memset(ifmgd->tx_tspec, 0, sizeof(ifmgd->tx_tspec)); wiphy_delayed_work_cancel(local->hw.wiphy, &ifmgd->tx_tspec_wk); sdata->vif.bss_conf.power_type = IEEE80211_REG_UNSET_AP; sdata->vif.bss_conf.pwr_reduction = 0; ieee80211_clear_tpe(&sdata->vif.bss_conf.tpe); sdata->vif.cfg.eml_cap = 0; sdata->vif.cfg.eml_med_sync_delay = 0; sdata->vif.cfg.mld_capa_op = 0; memset(&sdata->u.mgd.ttlm_info, 0, sizeof(sdata->u.mgd.ttlm_info)); wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &ifmgd->ttlm_work); memset(&sdata->vif.neg_ttlm, 0, sizeof(sdata->vif.neg_ttlm)); wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &ifmgd->neg_ttlm_timeout_work); sdata->u.mgd.removed_links = 0; wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.ml_reconf_work); wiphy_work_cancel(sdata->local->hw.wiphy, &ifmgd->teardown_ttlm_work); /* if disconnection happens in the middle of the ML reconfiguration * flow, cfg80211 must called to release the BSS references obtained * when the flow started. */ ieee80211_ml_reconf_reset(sdata); ieee80211_vif_set_links(sdata, 0, 0); ifmgd->mcast_seq_last = IEEE80211_SN_MODULO; ifmgd->epcs.enabled = false; ifmgd->epcs.dialog_token = 0; memset(ifmgd->userspace_selectors, 0, sizeof(ifmgd->userspace_selectors)); } static void ieee80211_reset_ap_probe(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); if (!(ifmgd->flags & IEEE80211_STA_CONNECTION_POLL)) return; __ieee80211_stop_poll(sdata); ieee80211_recalc_ps(local); if (ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR)) return; /* * We've received a probe response, but are not sure whether * we have or will be receiving any beacons or data, so let's * schedule the timers again, just in case. */ ieee80211_sta_reset_beacon_monitor(sdata); mod_timer(&ifmgd->conn_mon_timer, round_jiffies_up(jiffies + IEEE80211_CONNECTION_IDLE_TIME)); } static void ieee80211_sta_tx_wmm_ac_notify(struct ieee80211_sub_if_data *sdata, struct ieee80211_hdr *hdr, u16 tx_time) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u16 tid; int ac; struct ieee80211_sta_tx_tspec *tx_tspec; unsigned long now = jiffies; if (!ieee80211_is_data_qos(hdr->frame_control)) return; tid = ieee80211_get_tid(hdr); ac = ieee80211_ac_from_tid(tid); tx_tspec = &ifmgd->tx_tspec[ac]; if (likely(!tx_tspec->admitted_time)) return; if (time_after(now, tx_tspec->time_slice_start + HZ)) { tx_tspec->consumed_tx_time = 0; tx_tspec->time_slice_start = now; if (tx_tspec->downgraded) { tx_tspec->action = TX_TSPEC_ACTION_STOP_DOWNGRADE; wiphy_delayed_work_queue(sdata->local->hw.wiphy, &ifmgd->tx_tspec_wk, 0); } } if (tx_tspec->downgraded) return; tx_tspec->consumed_tx_time += tx_time; if (tx_tspec->consumed_tx_time >= tx_tspec->admitted_time) { tx_tspec->downgraded = true; tx_tspec->action = TX_TSPEC_ACTION_DOWNGRADE; wiphy_delayed_work_queue(sdata->local->hw.wiphy, &ifmgd->tx_tspec_wk, 0); } } void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata, struct ieee80211_hdr *hdr, bool ack, u16 tx_time) { ieee80211_sta_tx_wmm_ac_notify(sdata, hdr, tx_time); if (!ieee80211_is_any_nullfunc(hdr->frame_control) || !sdata->u.mgd.probe_send_count) return; if (ack) sdata->u.mgd.probe_send_count = 0; else sdata->u.mgd.nullfunc_failed = true; wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } static void ieee80211_mlme_send_probe_req(struct ieee80211_sub_if_data *sdata, const u8 *src, const u8 *dst, const u8 *ssid, size_t ssid_len, struct ieee80211_channel *channel) { struct sk_buff *skb; skb = ieee80211_build_probe_req(sdata, src, dst, (u32)-1, channel, ssid, ssid_len, NULL, 0, IEEE80211_PROBE_FLAG_DIRECTED); if (skb) ieee80211_tx_skb(sdata, skb); } static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u8 *dst = sdata->vif.cfg.ap_addr; u8 unicast_limit = max(1, max_probe_tries - 3); struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); /* * Try sending broadcast probe requests for the last three * probe requests after the first ones failed since some * buggy APs only support broadcast probe requests. */ if (ifmgd->probe_send_count >= unicast_limit) dst = NULL; /* * When the hardware reports an accurate Tx ACK status, it's * better to send a nullfunc frame instead of a probe request, * as it will kick us off the AP quickly if we aren't associated * anymore. The timeout will be reset if the frame is ACKed by * the AP. */ ifmgd->probe_send_count++; if (dst) { sta = sta_info_get(sdata, dst); if (!WARN_ON(!sta)) ieee80211_check_fast_rx(sta); } if (ieee80211_hw_check(&sdata->local->hw, REPORTS_TX_ACK_STATUS)) { ifmgd->nullfunc_failed = false; ieee80211_send_nullfunc(sdata->local, sdata, false); } else { ieee80211_mlme_send_probe_req(sdata, sdata->vif.addr, dst, sdata->vif.cfg.ssid, sdata->vif.cfg.ssid_len, sdata->deflink.conf->bss->channel); } ifmgd->probe_timeout = jiffies + msecs_to_jiffies(probe_wait_ms); run_again(sdata, ifmgd->probe_timeout); } static void ieee80211_mgd_probe_ap(struct ieee80211_sub_if_data *sdata, bool beacon) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; bool already = false; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (!ieee80211_sdata_running(sdata)) return; if (!ifmgd->associated) return; if (sdata->local->tmp_channel || sdata->local->scanning) return; if (sdata->local->suspending) { /* reschedule after resume */ ieee80211_reset_ap_probe(sdata); return; } if (beacon) { mlme_dbg_ratelimited(sdata, "detected beacon loss from AP (missed %d beacons) - probing\n", beacon_loss_count); ieee80211_cqm_beacon_loss_notify(&sdata->vif, GFP_KERNEL); } /* * The driver/our work has already reported this event or the * connection monitoring has kicked in and we have already sent * a probe request. Or maybe the AP died and the driver keeps * reporting until we disassociate... * * In either case we have to ignore the current call to this * function (except for setting the correct probe reason bit) * because otherwise we would reset the timer every time and * never check whether we received a probe response! */ if (ifmgd->flags & IEEE80211_STA_CONNECTION_POLL) already = true; ifmgd->flags |= IEEE80211_STA_CONNECTION_POLL; if (already) return; ieee80211_recalc_ps(sdata->local); ifmgd->probe_send_count = 0; ieee80211_mgd_probe_ap_send(sdata); } struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct cfg80211_bss *cbss; struct sk_buff *skb; const struct element *ssid; int ssid_len; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION || ieee80211_vif_is_mld(&sdata->vif))) return NULL; if (ifmgd->associated) cbss = sdata->deflink.conf->bss; else if (ifmgd->auth_data) cbss = ifmgd->auth_data->bss; else if (ifmgd->assoc_data && ifmgd->assoc_data->link[0].bss) cbss = ifmgd->assoc_data->link[0].bss; else return NULL; rcu_read_lock(); ssid = ieee80211_bss_get_elem(cbss, WLAN_EID_SSID); if (WARN_ONCE(!ssid || ssid->datalen > IEEE80211_MAX_SSID_LEN, "invalid SSID element (len=%d)", ssid ? ssid->datalen : -1)) ssid_len = 0; else ssid_len = ssid->datalen; skb = ieee80211_build_probe_req(sdata, sdata->vif.addr, cbss->bssid, (u32) -1, cbss->channel, ssid->data, ssid_len, NULL, 0, IEEE80211_PROBE_FLAG_DIRECTED); rcu_read_unlock(); return skb; } EXPORT_SYMBOL(ieee80211_ap_probereq_get); static void ieee80211_report_disconnect(struct ieee80211_sub_if_data *sdata, const u8 *buf, size_t len, bool tx, u16 reason, bool reconnect) { struct ieee80211_event event = { .type = MLME_EVENT, .u.mlme.data = tx ? DEAUTH_TX_EVENT : DEAUTH_RX_EVENT, .u.mlme.reason = reason, }; if (tx) cfg80211_tx_mlme_mgmt(sdata->dev, buf, len, reconnect); else cfg80211_rx_mlme_mgmt(sdata->dev, buf, len); drv_event_callback(sdata->local, sdata, &event); } static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; lockdep_assert_wiphy(local->hw.wiphy); if (!ifmgd->associated) return; if (!ifmgd->driver_disconnect) { unsigned int link_id; /* * AP is probably out of range (or not reachable for another * reason) so remove the bss structs for that AP. In the case * of multi-link, it's not clear that all of them really are * out of range, but if they weren't the driver likely would * have switched to just have a single link active? */ for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link; link = sdata_dereference(sdata->link[link_id], sdata); if (!link || !link->conf->bss) continue; cfg80211_unlink_bss(local->hw.wiphy, link->conf->bss); link->conf->bss = NULL; } } ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, ifmgd->driver_disconnect ? WLAN_REASON_DEAUTH_LEAVING : WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, true, frame_buf); /* the other links will be destroyed */ sdata->vif.bss_conf.csa_active = false; sdata->deflink.u.mgd.csa.waiting_bcn = false; sdata->deflink.u.mgd.csa.blocked_tx = false; ieee80211_vif_unblock_queues_csa(sdata); ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, ifmgd->reconnect); ifmgd->reconnect = false; } static void ieee80211_beacon_connection_loss_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, u.mgd.beacon_connection_loss_work); struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; if (ifmgd->connection_loss) { sdata_info(sdata, "Connection to AP %pM lost\n", sdata->vif.cfg.ap_addr); __ieee80211_disconnect(sdata); ifmgd->connection_loss = false; } else if (ifmgd->driver_disconnect) { sdata_info(sdata, "Driver requested disconnection from AP %pM\n", sdata->vif.cfg.ap_addr); __ieee80211_disconnect(sdata); ifmgd->driver_disconnect = false; } else { if (ifmgd->associated) sdata->deflink.u.mgd.beacon_loss_count++; ieee80211_mgd_probe_ap(sdata, true); } } static void ieee80211_csa_connection_drop_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, u.mgd.csa_connection_drop_work); __ieee80211_disconnect(sdata); } void ieee80211_beacon_loss(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_hw *hw = &sdata->local->hw; trace_api_beacon_loss(sdata); sdata->u.mgd.connection_loss = false; wiphy_work_queue(hw->wiphy, &sdata->u.mgd.beacon_connection_loss_work); } EXPORT_SYMBOL(ieee80211_beacon_loss); void ieee80211_connection_loss(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata; struct ieee80211_hw *hw; KUNIT_STATIC_STUB_REDIRECT(ieee80211_connection_loss, vif); sdata = vif_to_sdata(vif); hw = &sdata->local->hw; trace_api_connection_loss(sdata); sdata->u.mgd.connection_loss = true; wiphy_work_queue(hw->wiphy, &sdata->u.mgd.beacon_connection_loss_work); } EXPORT_SYMBOL(ieee80211_connection_loss); void ieee80211_disconnect(struct ieee80211_vif *vif, bool reconnect) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_hw *hw = &sdata->local->hw; trace_api_disconnect(sdata, reconnect); if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) return; sdata->u.mgd.driver_disconnect = true; sdata->u.mgd.reconnect = reconnect; wiphy_work_queue(hw->wiphy, &sdata->u.mgd.beacon_connection_loss_work); } EXPORT_SYMBOL(ieee80211_disconnect); static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata, bool assoc) { struct ieee80211_mgd_auth_data *auth_data = sdata->u.mgd.auth_data; lockdep_assert_wiphy(sdata->local->hw.wiphy); sdata->u.mgd.auth_data = NULL; if (!assoc) { /* * we are not authenticated yet, the only timer that could be * running is the timeout for the authentication response which * which is not relevant anymore. */ timer_delete_sync(&sdata->u.mgd.timer); sta_info_destroy_addr(sdata, auth_data->ap_addr); /* other links are destroyed */ eth_zero_addr(sdata->deflink.u.mgd.bssid); ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_BSSID); sdata->u.mgd.flags = 0; ieee80211_link_release_channel(&sdata->deflink); ieee80211_vif_set_links(sdata, 0, 0); } cfg80211_put_bss(sdata->local->hw.wiphy, auth_data->bss); kfree(auth_data); } enum assoc_status { ASSOC_SUCCESS, ASSOC_REJECTED, ASSOC_TIMEOUT, ASSOC_ABANDON, }; static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata, enum assoc_status status) { struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data; lockdep_assert_wiphy(sdata->local->hw.wiphy); sdata->u.mgd.assoc_data = NULL; if (status != ASSOC_SUCCESS) { /* * we are not associated yet, the only timer that could be * running is the timeout for the association response which * which is not relevant anymore. */ timer_delete_sync(&sdata->u.mgd.timer); sta_info_destroy_addr(sdata, assoc_data->ap_addr); eth_zero_addr(sdata->deflink.u.mgd.bssid); ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_BSSID); sdata->u.mgd.flags = 0; sdata->vif.bss_conf.mu_mimo_owner = false; if (status != ASSOC_REJECTED) { struct cfg80211_assoc_failure data = { .timeout = status == ASSOC_TIMEOUT, }; int i; BUILD_BUG_ON(ARRAY_SIZE(data.bss) != ARRAY_SIZE(assoc_data->link)); for (i = 0; i < ARRAY_SIZE(data.bss); i++) data.bss[i] = assoc_data->link[i].bss; if (ieee80211_vif_is_mld(&sdata->vif)) data.ap_mld_addr = assoc_data->ap_addr; cfg80211_assoc_failure(sdata->dev, &data); } ieee80211_link_release_channel(&sdata->deflink); ieee80211_vif_set_links(sdata, 0, 0); } kfree(assoc_data); } static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { struct ieee80211_local *local = sdata->local; struct ieee80211_mgd_auth_data *auth_data = sdata->u.mgd.auth_data; const struct element *challenge; u8 *pos; u32 tx_flags = 0; struct ieee80211_prep_tx_info info = { .subtype = IEEE80211_STYPE_AUTH, .link_id = auth_data->link_id, }; pos = mgmt->u.auth.variable; challenge = cfg80211_find_elem(WLAN_EID_CHALLENGE, pos, len - (pos - (u8 *)mgmt)); if (!challenge) return; auth_data->expected_transaction = 4; drv_mgd_prepare_tx(sdata->local, sdata, &info); if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_MLME_CONN_TX; ieee80211_send_auth(sdata, 3, auth_data->algorithm, 0, (void *)challenge, challenge->datalen + sizeof(*challenge), auth_data->ap_addr, auth_data->ap_addr, auth_data->key, auth_data->key_len, auth_data->key_idx, tx_flags); } static bool ieee80211_mark_sta_auth(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; const u8 *ap_addr = ifmgd->auth_data->ap_addr; struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); sdata_info(sdata, "authenticated\n"); ifmgd->auth_data->done = true; ifmgd->auth_data->timeout = jiffies + IEEE80211_AUTH_WAIT_ASSOC; ifmgd->auth_data->timeout_started = true; run_again(sdata, ifmgd->auth_data->timeout); /* move station state to auth */ sta = sta_info_get(sdata, ap_addr); if (!sta) { WARN_ONCE(1, "%s: STA %pM not found", sdata->name, ap_addr); return false; } if (sta_info_move_state(sta, IEEE80211_STA_AUTH)) { sdata_info(sdata, "failed moving %pM to auth\n", ap_addr); return false; } return true; } static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u16 auth_alg, auth_transaction, status_code; struct ieee80211_event event = { .type = MLME_EVENT, .u.mlme.data = AUTH_EVENT, }; struct ieee80211_prep_tx_info info = { .subtype = IEEE80211_STYPE_AUTH, }; bool sae_need_confirm = false; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (len < 24 + 6) return; if (!ifmgd->auth_data || ifmgd->auth_data->done) return; if (!ether_addr_equal(ifmgd->auth_data->ap_addr, mgmt->bssid)) return; auth_alg = le16_to_cpu(mgmt->u.auth.auth_alg); auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction); status_code = le16_to_cpu(mgmt->u.auth.status_code); info.link_id = ifmgd->auth_data->link_id; if (auth_alg != ifmgd->auth_data->algorithm || (auth_alg != WLAN_AUTH_SAE && auth_transaction != ifmgd->auth_data->expected_transaction) || (auth_alg == WLAN_AUTH_SAE && (auth_transaction < ifmgd->auth_data->expected_transaction || auth_transaction > 2))) { sdata_info(sdata, "%pM unexpected authentication state: alg %d (expected %d) transact %d (expected %d)\n", mgmt->sa, auth_alg, ifmgd->auth_data->algorithm, auth_transaction, ifmgd->auth_data->expected_transaction); goto notify_driver; } if (status_code != WLAN_STATUS_SUCCESS) { cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); if (auth_alg == WLAN_AUTH_SAE && (status_code == WLAN_STATUS_ANTI_CLOG_REQUIRED || (auth_transaction == 1 && (status_code == WLAN_STATUS_SAE_HASH_TO_ELEMENT || status_code == WLAN_STATUS_SAE_PK)))) { /* waiting for userspace now */ ifmgd->auth_data->waiting = true; ifmgd->auth_data->timeout = jiffies + IEEE80211_AUTH_WAIT_SAE_RETRY; ifmgd->auth_data->timeout_started = true; run_again(sdata, ifmgd->auth_data->timeout); if (auth_transaction == 1) sae_need_confirm = true; goto notify_driver; } sdata_info(sdata, "%pM denied authentication (status %d)\n", mgmt->sa, status_code); ieee80211_destroy_auth_data(sdata, false); event.u.mlme.status = MLME_DENIED; event.u.mlme.reason = status_code; drv_event_callback(sdata->local, sdata, &event); goto notify_driver; } switch (ifmgd->auth_data->algorithm) { case WLAN_AUTH_OPEN: case WLAN_AUTH_LEAP: case WLAN_AUTH_FT: case WLAN_AUTH_SAE: case WLAN_AUTH_FILS_SK: case WLAN_AUTH_FILS_SK_PFS: case WLAN_AUTH_FILS_PK: break; case WLAN_AUTH_SHARED_KEY: if (ifmgd->auth_data->expected_transaction != 4) { ieee80211_auth_challenge(sdata, mgmt, len); /* need another frame */ return; } break; default: WARN_ONCE(1, "invalid auth alg %d", ifmgd->auth_data->algorithm); goto notify_driver; } event.u.mlme.status = MLME_SUCCESS; info.success = 1; drv_event_callback(sdata->local, sdata, &event); if (ifmgd->auth_data->algorithm != WLAN_AUTH_SAE || (auth_transaction == 2 && ifmgd->auth_data->expected_transaction == 2)) { if (!ieee80211_mark_sta_auth(sdata)) return; /* ignore frame -- wait for timeout */ } else if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE && auth_transaction == 1) { sae_need_confirm = true; } else if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE && auth_transaction == 2) { sdata_info(sdata, "SAE peer confirmed\n"); ifmgd->auth_data->peer_confirmed = true; } cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); notify_driver: if (!sae_need_confirm) drv_mgd_complete_tx(sdata->local, sdata, &info); } #define case_WLAN(type) \ case WLAN_REASON_##type: return #type const char *ieee80211_get_reason_code_string(u16 reason_code) { switch (reason_code) { case_WLAN(UNSPECIFIED); case_WLAN(PREV_AUTH_NOT_VALID); case_WLAN(DEAUTH_LEAVING); case_WLAN(DISASSOC_DUE_TO_INACTIVITY); case_WLAN(DISASSOC_AP_BUSY); case_WLAN(CLASS2_FRAME_FROM_NONAUTH_STA); case_WLAN(CLASS3_FRAME_FROM_NONASSOC_STA); case_WLAN(DISASSOC_STA_HAS_LEFT); case_WLAN(STA_REQ_ASSOC_WITHOUT_AUTH); case_WLAN(DISASSOC_BAD_POWER); case_WLAN(DISASSOC_BAD_SUPP_CHAN); case_WLAN(INVALID_IE); case_WLAN(MIC_FAILURE); case_WLAN(4WAY_HANDSHAKE_TIMEOUT); case_WLAN(GROUP_KEY_HANDSHAKE_TIMEOUT); case_WLAN(IE_DIFFERENT); case_WLAN(INVALID_GROUP_CIPHER); case_WLAN(INVALID_PAIRWISE_CIPHER); case_WLAN(INVALID_AKMP); case_WLAN(UNSUPP_RSN_VERSION); case_WLAN(INVALID_RSN_IE_CAP); case_WLAN(IEEE8021X_FAILED); case_WLAN(CIPHER_SUITE_REJECTED); case_WLAN(DISASSOC_UNSPECIFIED_QOS); case_WLAN(DISASSOC_QAP_NO_BANDWIDTH); case_WLAN(DISASSOC_LOW_ACK); case_WLAN(DISASSOC_QAP_EXCEED_TXOP); case_WLAN(QSTA_LEAVE_QBSS); case_WLAN(QSTA_NOT_USE); case_WLAN(QSTA_REQUIRE_SETUP); case_WLAN(QSTA_TIMEOUT); case_WLAN(QSTA_CIPHER_NOT_SUPP); case_WLAN(MESH_PEER_CANCELED); case_WLAN(MESH_MAX_PEERS); case_WLAN(MESH_CONFIG); case_WLAN(MESH_CLOSE); case_WLAN(MESH_MAX_RETRIES); case_WLAN(MESH_CONFIRM_TIMEOUT); case_WLAN(MESH_INVALID_GTK); case_WLAN(MESH_INCONSISTENT_PARAM); case_WLAN(MESH_INVALID_SECURITY); case_WLAN(MESH_PATH_ERROR); case_WLAN(MESH_PATH_NOFORWARD); case_WLAN(MESH_PATH_DEST_UNREACHABLE); case_WLAN(MAC_EXISTS_IN_MBSS); case_WLAN(MESH_CHAN_REGULATORY); case_WLAN(MESH_CHAN); default: return "<unknown>"; } } static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u16 reason_code = le16_to_cpu(mgmt->u.deauth.reason_code); lockdep_assert_wiphy(sdata->local->hw.wiphy); if (len < 24 + 2) return; if (!ether_addr_equal(mgmt->bssid, mgmt->sa)) { ieee80211_tdls_handle_disconnect(sdata, mgmt->sa, reason_code); return; } if (ifmgd->associated && ether_addr_equal(mgmt->bssid, sdata->vif.cfg.ap_addr)) { sdata_info(sdata, "deauthenticated from %pM (Reason: %u=%s)\n", sdata->vif.cfg.ap_addr, reason_code, ieee80211_get_reason_code_string(reason_code)); ieee80211_set_disassoc(sdata, 0, 0, false, NULL); ieee80211_report_disconnect(sdata, (u8 *)mgmt, len, false, reason_code, false); return; } if (ifmgd->assoc_data && ether_addr_equal(mgmt->bssid, ifmgd->assoc_data->ap_addr)) { sdata_info(sdata, "deauthenticated from %pM while associating (Reason: %u=%s)\n", ifmgd->assoc_data->ap_addr, reason_code, ieee80211_get_reason_code_string(reason_code)); ieee80211_destroy_assoc_data(sdata, ASSOC_ABANDON); cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); return; } } static void ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u16 reason_code; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (len < 24 + 2) return; if (!ifmgd->associated || !ether_addr_equal(mgmt->bssid, sdata->vif.cfg.ap_addr)) return; reason_code = le16_to_cpu(mgmt->u.disassoc.reason_code); if (!ether_addr_equal(mgmt->bssid, mgmt->sa)) { ieee80211_tdls_handle_disconnect(sdata, mgmt->sa, reason_code); return; } sdata_info(sdata, "disassociated from %pM (Reason: %u=%s)\n", sdata->vif.cfg.ap_addr, reason_code, ieee80211_get_reason_code_string(reason_code)); ieee80211_set_disassoc(sdata, 0, 0, false, NULL); ieee80211_report_disconnect(sdata, (u8 *)mgmt, len, false, reason_code, false); } static bool ieee80211_twt_req_supported(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct link_sta_info *link_sta, const struct ieee802_11_elems *elems) { const struct ieee80211_sta_he_cap *own_he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); if (elems->ext_capab_len < 10) return false; if (!(elems->ext_capab[9] & WLAN_EXT_CAPA10_TWT_RESPONDER_SUPPORT)) return false; return link_sta->pub->he_cap.he_cap_elem.mac_cap_info[0] & IEEE80211_HE_MAC_CAP0_TWT_RES && own_he_cap && (own_he_cap->he_cap_elem.mac_cap_info[0] & IEEE80211_HE_MAC_CAP0_TWT_REQ); } static u64 ieee80211_recalc_twt_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, struct ieee80211_link_data *link, struct link_sta_info *link_sta, struct ieee802_11_elems *elems) { bool twt = ieee80211_twt_req_supported(sdata, sband, link_sta, elems); if (link->conf->twt_requester != twt) { link->conf->twt_requester = twt; return BSS_CHANGED_TWT; } return 0; } static bool ieee80211_twt_bcast_support(struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *bss_conf, struct ieee80211_supported_band *sband, struct link_sta_info *link_sta) { const struct ieee80211_sta_he_cap *own_he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); return bss_conf->he_support && (link_sta->pub->he_cap.he_cap_elem.mac_cap_info[2] & IEEE80211_HE_MAC_CAP2_BCAST_TWT) && own_he_cap && (own_he_cap->he_cap_elem.mac_cap_info[2] & IEEE80211_HE_MAC_CAP2_BCAST_TWT); } static void ieee80211_epcs_changed(struct ieee80211_sub_if_data *sdata, bool enabled) { /* in any case this is called, dialog token should be reset */ sdata->u.mgd.epcs.dialog_token = 0; if (sdata->u.mgd.epcs.enabled == enabled) return; sdata->u.mgd.epcs.enabled = enabled; cfg80211_epcs_changed(sdata->dev, enabled); } static void ieee80211_epcs_teardown(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; u8 link_id; if (!sdata->u.mgd.epcs.enabled) return; lockdep_assert_wiphy(local->hw.wiphy); for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct ieee802_11_elems *elems; struct ieee80211_link_data *link; const struct cfg80211_bss_ies *ies; bool ret; rcu_read_lock(); link = sdata_dereference(sdata->link[link_id], sdata); if (!link || !link->conf || !link->conf->bss) { rcu_read_unlock(); continue; } if (link->u.mgd.disable_wmm_tracking) { rcu_read_unlock(); ieee80211_set_wmm_default(link, false, false); continue; } ies = rcu_dereference(link->conf->bss->beacon_ies); if (!ies) { rcu_read_unlock(); ieee80211_set_wmm_default(link, false, false); continue; } elems = ieee802_11_parse_elems(ies->data, ies->len, false, NULL); if (!elems) { rcu_read_unlock(); ieee80211_set_wmm_default(link, false, false); continue; } ret = _ieee80211_sta_wmm_params(local, link, elems->wmm_param, elems->wmm_param_len, elems->mu_edca_param_set); kfree(elems); rcu_read_unlock(); if (!ret) { ieee80211_set_wmm_default(link, false, false); continue; } ieee80211_mgd_set_link_qos_params(link); ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_QOS); } } static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, struct link_sta_info *link_sta, struct cfg80211_bss *cbss, struct ieee80211_mgmt *mgmt, const u8 *elem_start, unsigned int elem_len, u64 *changed) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data ?: sdata->u.mgd.reconf.add_links_data; struct ieee80211_bss_conf *bss_conf = link->conf; struct ieee80211_local *local = sdata->local; unsigned int link_id = link->link_id; struct ieee80211_elems_parse_params parse_params = { .mode = link->u.mgd.conn.mode, .start = elem_start, .len = elem_len, .link_id = link_id == assoc_data->assoc_link_id ? -1 : link_id, .from_ap = true, }; bool is_5ghz = cbss->channel->band == NL80211_BAND_5GHZ; bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ; bool is_s1g = cbss->channel->band == NL80211_BAND_S1GHZ; const struct cfg80211_bss_ies *bss_ies = NULL; struct ieee80211_supported_band *sband; struct ieee802_11_elems *elems; const __le16 prof_bss_param_ch_present = cpu_to_le16(IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT); u16 capab_info; bool ret; elems = ieee802_11_parse_elems_full(&parse_params); if (!elems) return false; if (link_id == assoc_data->assoc_link_id) { capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); /* * we should not get to this flow unless the association was * successful, so set the status directly to success */ assoc_data->link[link_id].status = WLAN_STATUS_SUCCESS; if (elems->ml_basic) { int bss_param_ch_cnt = ieee80211_mle_get_bss_param_ch_cnt((const void *)elems->ml_basic); if (bss_param_ch_cnt < 0) { ret = false; goto out; } bss_conf->bss_param_ch_cnt = bss_param_ch_cnt; bss_conf->bss_param_ch_cnt_link_id = link_id; } } else if (elems->parse_error & IEEE80211_PARSE_ERR_DUP_NEST_ML_BASIC || !elems->prof || !(elems->prof->control & prof_bss_param_ch_present)) { ret = false; goto out; } else { const u8 *ptr = elems->prof->variable + elems->prof->sta_info_len - 1; int bss_param_ch_cnt; /* * During parsing, we validated that these fields exist, * otherwise elems->prof would have been set to NULL. */ capab_info = get_unaligned_le16(ptr); assoc_data->link[link_id].status = get_unaligned_le16(ptr + 2); bss_param_ch_cnt = ieee80211_mle_basic_sta_prof_bss_param_ch_cnt(elems->prof); bss_conf->bss_param_ch_cnt = bss_param_ch_cnt; bss_conf->bss_param_ch_cnt_link_id = link_id; if (assoc_data->link[link_id].status != WLAN_STATUS_SUCCESS) { link_info(link, "association response status code=%u\n", assoc_data->link[link_id].status); ret = true; goto out; } } if (!is_s1g && !elems->supp_rates) { sdata_info(sdata, "no SuppRates element in AssocResp\n"); ret = false; goto out; } link->u.mgd.tdls_chan_switch_prohibited = elems->ext_capab && elems->ext_capab_len >= 5 && (elems->ext_capab[4] & WLAN_EXT_CAPA5_TDLS_CH_SW_PROHIBITED); /* * Some APs are erroneously not including some information in their * (re)association response frames. Try to recover by using the data * from the beacon or probe response. This seems to afflict mobile * 2G/3G/4G wifi routers, reported models include the "Onda PN51T", * "Vodafone PocketWiFi 2", "ZTE MF60" and a similar T-Mobile device. */ if (!ieee80211_hw_check(&local->hw, STRICT) && !is_6ghz && ((assoc_data->wmm && !elems->wmm_param) || (link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HT && (!elems->ht_cap_elem || !elems->ht_operation)) || (is_5ghz && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_VHT && (!elems->vht_cap_elem || !elems->vht_operation)))) { const struct cfg80211_bss_ies *ies; struct ieee802_11_elems *bss_elems; rcu_read_lock(); ies = rcu_dereference(cbss->ies); if (ies) bss_ies = kmemdup(ies, sizeof(*ies) + ies->len, GFP_ATOMIC); rcu_read_unlock(); if (!bss_ies) { ret = false; goto out; } parse_params.start = bss_ies->data; parse_params.len = bss_ies->len; parse_params.bss = cbss; parse_params.link_id = -1; bss_elems = ieee802_11_parse_elems_full(&parse_params); if (!bss_elems) { ret = false; goto out; } if (assoc_data->wmm && !elems->wmm_param && bss_elems->wmm_param) { elems->wmm_param = bss_elems->wmm_param; sdata_info(sdata, "AP bug: WMM param missing from AssocResp\n"); } /* * Also check if we requested HT/VHT, otherwise the AP doesn't * have to include the IEs in the (re)association response. */ if (!elems->ht_cap_elem && bss_elems->ht_cap_elem && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HT) { elems->ht_cap_elem = bss_elems->ht_cap_elem; sdata_info(sdata, "AP bug: HT capability missing from AssocResp\n"); } if (!elems->ht_operation && bss_elems->ht_operation && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HT) { elems->ht_operation = bss_elems->ht_operation; sdata_info(sdata, "AP bug: HT operation missing from AssocResp\n"); } if (is_5ghz) { if (!elems->vht_cap_elem && bss_elems->vht_cap_elem && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_VHT) { elems->vht_cap_elem = bss_elems->vht_cap_elem; sdata_info(sdata, "AP bug: VHT capa missing from AssocResp\n"); } if (!elems->vht_operation && bss_elems->vht_operation && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_VHT) { elems->vht_operation = bss_elems->vht_operation; sdata_info(sdata, "AP bug: VHT operation missing from AssocResp\n"); } } kfree(bss_elems); } /* * We previously checked these in the beacon/probe response, so * they should be present here. This is just a safety net. * Note that the ieee80211_config_bw() below would also check * for this (and more), but this has better error reporting. */ if (!is_6ghz && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HT && (!elems->wmm_param || !elems->ht_cap_elem || !elems->ht_operation)) { sdata_info(sdata, "HT AP is missing WMM params or HT capability/operation\n"); ret = false; goto out; } if (is_5ghz && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_VHT && (!elems->vht_cap_elem || !elems->vht_operation)) { sdata_info(sdata, "VHT AP is missing VHT capability/operation\n"); ret = false; goto out; } /* check/update if AP changed anything in assoc response vs. scan */ if (ieee80211_config_bw(link, elems, link_id == assoc_data->assoc_link_id, changed, le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE)) { ret = false; goto out; } if (WARN_ON(!link->conf->chanreq.oper.chan)) { ret = false; goto out; } sband = local->hw.wiphy->bands[link->conf->chanreq.oper.chan->band]; /* Set up internal HT/VHT capabilities */ if (elems->ht_cap_elem && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HT) ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, elems->ht_cap_elem, link_sta); if (elems->vht_cap_elem && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_VHT) { const struct ieee80211_vht_cap *bss_vht_cap = NULL; const struct cfg80211_bss_ies *ies; /* * Cisco AP module 9115 with FW 17.3 has a bug and sends a * too large maximum MPDU length in the association response * (indicating 12k) that it cannot actually process ... * Work around that. */ rcu_read_lock(); ies = rcu_dereference(cbss->ies); if (ies) { const struct element *elem; elem = cfg80211_find_elem(WLAN_EID_VHT_CAPABILITY, ies->data, ies->len); if (elem && elem->datalen >= sizeof(*bss_vht_cap)) bss_vht_cap = (const void *)elem->data; } if (ieee80211_hw_check(&local->hw, STRICT) && (!bss_vht_cap || memcmp(bss_vht_cap, elems->vht_cap_elem, sizeof(*bss_vht_cap)))) { rcu_read_unlock(); ret = false; link_info(link, "VHT capabilities mismatch\n"); goto out; } ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, elems->vht_cap_elem, bss_vht_cap, link_sta); rcu_read_unlock(); } if (elems->he_operation && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HE && elems->he_cap) { ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, elems->he_cap, elems->he_cap_len, elems->he_6ghz_capa, link_sta); bss_conf->he_support = link_sta->pub->he_cap.has_he; if (elems->rsnx && elems->rsnx_len && (elems->rsnx[0] & WLAN_RSNX_CAPA_PROTECTED_TWT) && wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_PROTECTED_TWT)) bss_conf->twt_protected = true; else bss_conf->twt_protected = false; *changed |= ieee80211_recalc_twt_req(sdata, sband, link, link_sta, elems); if (elems->eht_operation && elems->eht_cap && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_EHT) { ieee80211_eht_cap_ie_to_sta_eht_cap(sdata, sband, elems->he_cap, elems->he_cap_len, elems->eht_cap, elems->eht_cap_len, link_sta); bss_conf->eht_support = link_sta->pub->eht_cap.has_eht; bss_conf->epcs_support = bss_conf->eht_support && !!(elems->eht_cap->fixed.mac_cap_info[0] & IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS); /* EPCS might be already enabled but a new added link * does not support EPCS. This should not really happen * in practice. */ if (sdata->u.mgd.epcs.enabled && !bss_conf->epcs_support) ieee80211_epcs_teardown(sdata); } else { bss_conf->eht_support = false; bss_conf->epcs_support = false; } } else { bss_conf->he_support = false; bss_conf->twt_requester = false; bss_conf->twt_protected = false; bss_conf->eht_support = false; bss_conf->epcs_support = false; } if (elems->s1g_oper && link->u.mgd.conn.mode == IEEE80211_CONN_MODE_S1G && elems->s1g_capab) ieee80211_s1g_cap_to_sta_s1g_cap(sdata, elems->s1g_capab, link_sta); bss_conf->twt_broadcast = ieee80211_twt_bcast_support(sdata, bss_conf, sband, link_sta); if (bss_conf->he_support) { bss_conf->he_bss_color.color = le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_BSS_COLOR_MASK); bss_conf->he_bss_color.partial = le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR); bss_conf->he_bss_color.enabled = !le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED); if (bss_conf->he_bss_color.enabled) *changed |= BSS_CHANGED_HE_BSS_COLOR; bss_conf->htc_trig_based_pkt_ext = le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK); bss_conf->frame_time_rts_th = le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK); bss_conf->uora_exists = !!elems->uora_element; if (elems->uora_element) bss_conf->uora_ocw_range = elems->uora_element[0]; ieee80211_he_op_ie_to_bss_conf(&sdata->vif, elems->he_operation); ieee80211_he_spr_ie_to_bss_conf(&sdata->vif, elems->he_spr); /* TODO: OPEN: what happens if BSS color disable is set? */ } if (cbss->transmitted_bss) { bss_conf->nontransmitted = true; ether_addr_copy(bss_conf->transmitter_bssid, cbss->transmitted_bss->bssid); bss_conf->bssid_indicator = cbss->max_bssid_indicator; bss_conf->bssid_index = cbss->bssid_index; } /* * Some APs, e.g. Netgear WNDR3700, report invalid HT operation data * in their association response, so ignore that data for our own * configuration. If it changed since the last beacon, we'll get the * next beacon and update then. */ /* * If an operating mode notification IE is present, override the * NSS calculation (that would be done in rate_control_rate_init()) * and use the # of streams from that element. */ if (elems->opmode_notif && !(*elems->opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF)) { u8 nss; nss = *elems->opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_MASK; nss >>= IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT; nss += 1; link_sta->pub->rx_nss = nss; } /* * Always handle WMM once after association regardless * of the first value the AP uses. Setting -1 here has * that effect because the AP values is an unsigned * 4-bit value. */ link->u.mgd.wmm_last_param_set = -1; link->u.mgd.mu_edca_last_param_set = -1; if (link->u.mgd.disable_wmm_tracking) { ieee80211_set_wmm_default(link, false, false); } else if (!ieee80211_sta_wmm_params(local, link, elems->wmm_param, elems->wmm_param_len, elems->mu_edca_param_set)) { /* still enable QoS since we might have HT/VHT */ ieee80211_set_wmm_default(link, false, true); /* disable WMM tracking in this case to disable * tracking WMM parameter changes in the beacon if * the parameters weren't actually valid. Doing so * avoids changing parameters very strangely when * the AP is going back and forth between valid and * invalid parameters. */ link->u.mgd.disable_wmm_tracking = true; } if (elems->max_idle_period_ie) { bss_conf->max_idle_period = le16_to_cpu(elems->max_idle_period_ie->max_idle_period); bss_conf->protected_keep_alive = !!(elems->max_idle_period_ie->idle_options & WLAN_IDLE_OPTIONS_PROTECTED_KEEP_ALIVE); *changed |= BSS_CHANGED_KEEP_ALIVE; } else { bss_conf->max_idle_period = 0; bss_conf->protected_keep_alive = false; } /* set assoc capability (AID was already set earlier), * ieee80211_set_associated() will tell the driver */ bss_conf->assoc_capability = capab_info; ret = true; out: kfree(elems); kfree(bss_ies); return ret; } static int ieee80211_mgd_setup_link_sta(struct ieee80211_link_data *link, struct sta_info *sta, struct link_sta_info *link_sta, struct cfg80211_bss *cbss) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_bss *bss = (void *)cbss->priv; u32 rates = 0, basic_rates = 0; bool have_higher_than_11mbit = false; int min_rate = INT_MAX, min_rate_index = -1; struct ieee80211_supported_band *sband; memcpy(link_sta->addr, cbss->bssid, ETH_ALEN); memcpy(link_sta->pub->addr, cbss->bssid, ETH_ALEN); /* TODO: S1G Basic Rate Set is expressed elsewhere */ if (cbss->channel->band == NL80211_BAND_S1GHZ) { ieee80211_s1g_sta_rate_init(sta); return 0; } sband = local->hw.wiphy->bands[cbss->channel->band]; ieee80211_get_rates(sband, bss->supp_rates, bss->supp_rates_len, NULL, 0, &rates, &basic_rates, NULL, &have_higher_than_11mbit, &min_rate, &min_rate_index); /* * This used to be a workaround for basic rates missing * in the association response frame. Now that we no * longer use the basic rates from there, it probably * doesn't happen any more, but keep the workaround so * in case some *other* APs are buggy in different ways * we can connect -- with a warning. * Allow this workaround only in case the AP provided at least * one rate. */ if (min_rate_index < 0) { link_info(link, "No legacy rates in association response\n"); return -EINVAL; } else if (!basic_rates) { link_info(link, "No basic rates, using min rate instead\n"); basic_rates = BIT(min_rate_index); } if (rates) link_sta->pub->supp_rates[cbss->channel->band] = rates; else link_info(link, "No rates found, keeping mandatory only\n"); link->conf->basic_rates = basic_rates; /* cf. IEEE 802.11 9.2.12 */ link->operating_11g_mode = sband->band == NL80211_BAND_2GHZ && have_higher_than_11mbit; return 0; } static u8 ieee80211_max_rx_chains(struct ieee80211_link_data *link, struct cfg80211_bss *cbss) { struct ieee80211_he_mcs_nss_supp *he_mcs_nss_supp; const struct element *ht_cap_elem, *vht_cap_elem; const struct cfg80211_bss_ies *ies; const struct ieee80211_ht_cap *ht_cap; const struct ieee80211_vht_cap *vht_cap; const struct ieee80211_he_cap_elem *he_cap; const struct element *he_cap_elem; u16 mcs_80_map, mcs_160_map; int i, mcs_nss_size; bool support_160; u8 chains = 1; if (link->u.mgd.conn.mode < IEEE80211_CONN_MODE_HT) return chains; ht_cap_elem = ieee80211_bss_get_elem(cbss, WLAN_EID_HT_CAPABILITY); if (ht_cap_elem && ht_cap_elem->datalen >= sizeof(*ht_cap)) { ht_cap = (void *)ht_cap_elem->data; chains = ieee80211_mcs_to_chains(&ht_cap->mcs); /* * TODO: use "Tx Maximum Number Spatial Streams Supported" and * "Tx Unequal Modulation Supported" fields. */ } if (link->u.mgd.conn.mode < IEEE80211_CONN_MODE_VHT) return chains; vht_cap_elem = ieee80211_bss_get_elem(cbss, WLAN_EID_VHT_CAPABILITY); if (vht_cap_elem && vht_cap_elem->datalen >= sizeof(*vht_cap)) { u8 nss; u16 tx_mcs_map; vht_cap = (void *)vht_cap_elem->data; tx_mcs_map = le16_to_cpu(vht_cap->supp_mcs.tx_mcs_map); for (nss = 8; nss > 0; nss--) { if (((tx_mcs_map >> (2 * (nss - 1))) & 3) != IEEE80211_VHT_MCS_NOT_SUPPORTED) break; } /* TODO: use "Tx Highest Supported Long GI Data Rate" field? */ chains = max(chains, nss); } if (link->u.mgd.conn.mode < IEEE80211_CONN_MODE_HE) return chains; ies = rcu_dereference(cbss->ies); he_cap_elem = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_CAPABILITY, ies->data, ies->len); if (!he_cap_elem || he_cap_elem->datalen < sizeof(*he_cap) + 1) return chains; /* skip one byte ext_tag_id */ he_cap = (void *)(he_cap_elem->data + 1); mcs_nss_size = ieee80211_he_mcs_nss_size(he_cap); /* invalid HE IE */ if (he_cap_elem->datalen < 1 + mcs_nss_size + sizeof(*he_cap)) return chains; /* mcs_nss is right after he_cap info */ he_mcs_nss_supp = (void *)(he_cap + 1); mcs_80_map = le16_to_cpu(he_mcs_nss_supp->tx_mcs_80); for (i = 7; i >= 0; i--) { u8 mcs_80 = mcs_80_map >> (2 * i) & 3; if (mcs_80 != IEEE80211_VHT_MCS_NOT_SUPPORTED) { chains = max_t(u8, chains, i + 1); break; } } support_160 = he_cap->phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G; if (!support_160) return chains; mcs_160_map = le16_to_cpu(he_mcs_nss_supp->tx_mcs_160); for (i = 7; i >= 0; i--) { u8 mcs_160 = mcs_160_map >> (2 * i) & 3; if (mcs_160 != IEEE80211_VHT_MCS_NOT_SUPPORTED) { chains = max_t(u8, chains, i + 1); break; } } return chains; } static void ieee80211_determine_our_sta_mode(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, struct cfg80211_assoc_request *req, bool wmm_used, int link_id, struct ieee80211_conn_settings *conn) { struct ieee80211_sta_ht_cap sta_ht_cap = sband->ht_cap; bool is_5ghz = sband->band == NL80211_BAND_5GHZ; bool is_6ghz = sband->band == NL80211_BAND_6GHZ; const struct ieee80211_sta_he_cap *he_cap; const struct ieee80211_sta_eht_cap *eht_cap; struct ieee80211_sta_vht_cap vht_cap; if (sband->band == NL80211_BAND_S1GHZ) { conn->mode = IEEE80211_CONN_MODE_S1G; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; mlme_dbg(sdata, "operating as S1G STA\n"); return; } conn->mode = IEEE80211_CONN_MODE_LEGACY; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); if (req && req->flags & ASSOC_REQ_DISABLE_HT) { mlme_link_id_dbg(sdata, link_id, "HT disabled by flag, limiting to legacy\n"); goto out; } if (!wmm_used) { mlme_link_id_dbg(sdata, link_id, "WMM/QoS not supported, limiting to legacy\n"); goto out; } if (req) { unsigned int i; for (i = 0; i < req->crypto.n_ciphers_pairwise; i++) { if (req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP40 || req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_TKIP || req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP104) { netdev_info(sdata->dev, "WEP/TKIP use, limiting to legacy\n"); goto out; } } } if (!sta_ht_cap.ht_supported && !is_6ghz) { mlme_link_id_dbg(sdata, link_id, "HT not supported (and not on 6 GHz), limiting to legacy\n"); goto out; } /* HT is fine */ conn->mode = IEEE80211_CONN_MODE_HT; conn->bw_limit = sta_ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ? IEEE80211_CONN_BW_LIMIT_40 : IEEE80211_CONN_BW_LIMIT_20; memcpy(&vht_cap, &sband->vht_cap, sizeof(vht_cap)); ieee80211_apply_vhtcap_overrides(sdata, &vht_cap); if (req && req->flags & ASSOC_REQ_DISABLE_VHT) { mlme_link_id_dbg(sdata, link_id, "VHT disabled by flag, limiting to HT\n"); goto out; } if (vht_cap.vht_supported && is_5ghz) { bool have_80mhz = false; unsigned int i; if (conn->bw_limit == IEEE80211_CONN_BW_LIMIT_20) { mlme_link_id_dbg(sdata, link_id, "no 40 MHz support on 5 GHz, limiting to HT\n"); goto out; } /* Allow VHT if at least one channel on the sband supports 80 MHz */ for (i = 0; i < sband->n_channels; i++) { if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED | IEEE80211_CHAN_NO_80MHZ)) continue; have_80mhz = true; break; } if (!have_80mhz) { mlme_link_id_dbg(sdata, link_id, "no 80 MHz channel support on 5 GHz, limiting to HT\n"); goto out; } } else if (is_5ghz) { /* !vht_supported but on 5 GHz */ mlme_link_id_dbg(sdata, link_id, "no VHT support on 5 GHz, limiting to HT\n"); goto out; } /* VHT - if we have - is fine, including 80 MHz, check 160 below again */ if (sband->band != NL80211_BAND_2GHZ) { conn->mode = IEEE80211_CONN_MODE_VHT; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_160; } if (is_5ghz && !(vht_cap.cap & (IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ | IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ))) { conn->bw_limit = IEEE80211_CONN_BW_LIMIT_80; mlme_link_id_dbg(sdata, link_id, "no VHT 160 MHz capability on 5 GHz, limiting to 80 MHz"); } if (req && req->flags & ASSOC_REQ_DISABLE_HE) { mlme_link_id_dbg(sdata, link_id, "HE disabled by flag, limiting to HT/VHT\n"); goto out; } he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); if (!he_cap) { WARN_ON(is_6ghz); mlme_link_id_dbg(sdata, link_id, "no HE support, limiting to HT/VHT\n"); goto out; } /* so we have HE */ conn->mode = IEEE80211_CONN_MODE_HE; /* check bandwidth */ switch (sband->band) { default: case NL80211_BAND_2GHZ: if (he_cap->he_cap_elem.phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G) break; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; mlme_link_id_dbg(sdata, link_id, "no 40 MHz HE cap in 2.4 GHz, limiting to 20 MHz\n"); break; case NL80211_BAND_5GHZ: if (!(he_cap->he_cap_elem.phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G)) { conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; mlme_link_id_dbg(sdata, link_id, "no 40/80 MHz HE cap in 5 GHz, limiting to 20 MHz\n"); break; } if (!(he_cap->he_cap_elem.phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G)) { conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_80); mlme_link_id_dbg(sdata, link_id, "no 160 MHz HE cap in 5 GHz, limiting to 80 MHz\n"); } break; case NL80211_BAND_6GHZ: if (he_cap->he_cap_elem.phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G) break; conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_80); mlme_link_id_dbg(sdata, link_id, "no 160 MHz HE cap in 6 GHz, limiting to 80 MHz\n"); break; } if (req && req->flags & ASSOC_REQ_DISABLE_EHT) { mlme_link_id_dbg(sdata, link_id, "EHT disabled by flag, limiting to HE\n"); goto out; } eht_cap = ieee80211_get_eht_iftype_cap_vif(sband, &sdata->vif); if (!eht_cap) { mlme_link_id_dbg(sdata, link_id, "no EHT support, limiting to HE\n"); goto out; } /* we have EHT */ conn->mode = IEEE80211_CONN_MODE_EHT; /* check bandwidth */ if (is_6ghz && eht_cap->eht_cap_elem.phy_cap_info[0] & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ) conn->bw_limit = IEEE80211_CONN_BW_LIMIT_320; else if (is_6ghz) mlme_link_id_dbg(sdata, link_id, "no EHT 320 MHz cap in 6 GHz, limiting to 160 MHz\n"); out: mlme_link_id_dbg(sdata, link_id, "determined local STA to be %s, BW limited to %d MHz\n", ieee80211_conn_mode_str(conn->mode), 20 * (1 << conn->bw_limit)); } static void ieee80211_determine_our_sta_mode_auth(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, struct cfg80211_auth_request *req, bool wmm_used, struct ieee80211_conn_settings *conn) { ieee80211_determine_our_sta_mode(sdata, sband, NULL, wmm_used, req->link_id > 0 ? req->link_id : 0, conn); } static void ieee80211_determine_our_sta_mode_assoc(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, struct cfg80211_assoc_request *req, bool wmm_used, int link_id, struct ieee80211_conn_settings *conn) { struct ieee80211_conn_settings tmp; WARN_ON(!req); ieee80211_determine_our_sta_mode(sdata, sband, req, wmm_used, link_id, &tmp); conn->mode = min_t(enum ieee80211_conn_mode, conn->mode, tmp.mode); conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, tmp.bw_limit); } static enum ieee80211_ap_reg_power ieee80211_ap_power_type(u8 control) { switch (u8_get_bits(control, IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) { case IEEE80211_6GHZ_CTRL_REG_LPI_AP: case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP: return IEEE80211_REG_LPI_AP; case IEEE80211_6GHZ_CTRL_REG_SP_AP: case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP: case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD: return IEEE80211_REG_SP_AP; case IEEE80211_6GHZ_CTRL_REG_VLP_AP: return IEEE80211_REG_VLP_AP; default: return IEEE80211_REG_UNSET_AP; } } static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, int link_id, struct cfg80211_bss *cbss, bool mlo, struct ieee80211_conn_settings *conn, unsigned long *userspace_selectors) { struct ieee80211_local *local = sdata->local; bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ; struct ieee80211_chan_req chanreq = {}; struct cfg80211_chan_def ap_chandef; struct ieee802_11_elems *elems; int ret; lockdep_assert_wiphy(local->hw.wiphy); rcu_read_lock(); elems = ieee80211_determine_chan_mode(sdata, conn, cbss, link_id, &chanreq, &ap_chandef, userspace_selectors); if (IS_ERR(elems)) { rcu_read_unlock(); return PTR_ERR(elems); } if (mlo && !elems->ml_basic) { sdata_info(sdata, "Rejecting MLO as it is not supported by AP\n"); rcu_read_unlock(); kfree(elems); return -EINVAL; } if (link && is_6ghz && conn->mode >= IEEE80211_CONN_MODE_HE) { const struct ieee80211_he_6ghz_oper *he_6ghz_oper; if (elems->pwr_constr_elem) link->conf->pwr_reduction = *elems->pwr_constr_elem; he_6ghz_oper = ieee80211_he_6ghz_oper(elems->he_operation); if (he_6ghz_oper) link->conf->power_type = ieee80211_ap_power_type(he_6ghz_oper->control); else link_info(link, "HE 6 GHz operation missing (on %d MHz), expect issues\n", cbss->channel->center_freq); link->conf->tpe = elems->tpe; ieee80211_rearrange_tpe(&link->conf->tpe, &ap_chandef, &chanreq.oper); } rcu_read_unlock(); /* the element data was RCU protected so no longer valid anyway */ kfree(elems); elems = NULL; if (!link) return 0; rcu_read_lock(); link->needed_rx_chains = min(ieee80211_max_rx_chains(link, cbss), local->rx_chains); rcu_read_unlock(); /* * If this fails (possibly due to channel context sharing * on incompatible channels, e.g. 80+80 and 160 sharing the * same control channel) try to use a smaller bandwidth. */ ret = ieee80211_link_use_channel(link, &chanreq, IEEE80211_CHANCTX_SHARED); /* don't downgrade for 5 and 10 MHz channels, though. */ if (chanreq.oper.width == NL80211_CHAN_WIDTH_5 || chanreq.oper.width == NL80211_CHAN_WIDTH_10) return ret; while (ret && chanreq.oper.width != NL80211_CHAN_WIDTH_20_NOHT) { ieee80211_chanreq_downgrade(&chanreq, conn); ret = ieee80211_link_use_channel(link, &chanreq, IEEE80211_CHANCTX_SHARED); } return ret; } static bool ieee80211_get_dtim(const struct cfg80211_bss_ies *ies, u8 *dtim_count, u8 *dtim_period) { const u8 *tim_ie = cfg80211_find_ie(WLAN_EID_TIM, ies->data, ies->len); const u8 *idx_ie = cfg80211_find_ie(WLAN_EID_MULTI_BSSID_IDX, ies->data, ies->len); const struct ieee80211_tim_ie *tim = NULL; const struct ieee80211_bssid_index *idx; bool valid = tim_ie && tim_ie[1] >= 2; if (valid) tim = (void *)(tim_ie + 2); if (dtim_count) *dtim_count = valid ? tim->dtim_count : 0; if (dtim_period) *dtim_period = valid ? tim->dtim_period : 0; /* Check if value is overridden by non-transmitted profile */ if (!idx_ie || idx_ie[1] < 3) return valid; idx = (void *)(idx_ie + 2); if (dtim_count) *dtim_count = idx->dtim_count; if (dtim_period) *dtim_period = idx->dtim_period; return true; } static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, struct ieee802_11_elems *elems, const u8 *elem_start, unsigned int elem_len) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data; struct ieee80211_local *local = sdata->local; unsigned int link_id; struct sta_info *sta; u64 changed[IEEE80211_MLD_MAX_NUM_LINKS] = {}; u16 valid_links = 0, dormant_links = 0; int err; lockdep_assert_wiphy(sdata->local->hw.wiphy); /* * station info was already allocated and inserted before * the association and should be available to us */ sta = sta_info_get(sdata, assoc_data->ap_addr); if (WARN_ON(!sta)) goto out_err; sta->sta.spp_amsdu = assoc_data->spp_amsdu; if (ieee80211_vif_is_mld(&sdata->vif)) { for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { if (!assoc_data->link[link_id].bss) continue; valid_links |= BIT(link_id); if (assoc_data->link[link_id].disabled) dormant_links |= BIT(link_id); if (link_id != assoc_data->assoc_link_id) { err = ieee80211_sta_allocate_link(sta, link_id); if (err) goto out_err; } } ieee80211_vif_set_links(sdata, valid_links, dormant_links); } for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; struct ieee80211_link_data *link; struct link_sta_info *link_sta; if (!cbss) continue; link = sdata_dereference(sdata->link[link_id], sdata); if (WARN_ON(!link)) goto out_err; if (ieee80211_vif_is_mld(&sdata->vif)) link_info(link, "local address %pM, AP link address %pM%s\n", link->conf->addr, assoc_data->link[link_id].bss->bssid, link_id == assoc_data->assoc_link_id ? " (assoc)" : ""); link_sta = rcu_dereference_protected(sta->link[link_id], lockdep_is_held(&local->hw.wiphy->mtx)); if (WARN_ON(!link_sta)) goto out_err; if (!link->u.mgd.have_beacon) { const struct cfg80211_bss_ies *ies; rcu_read_lock(); ies = rcu_dereference(cbss->beacon_ies); if (ies) link->u.mgd.have_beacon = true; else ies = rcu_dereference(cbss->ies); ieee80211_get_dtim(ies, &link->conf->sync_dtim_count, &link->u.mgd.dtim_period); link->conf->beacon_int = cbss->beacon_interval; rcu_read_unlock(); } link->conf->dtim_period = link->u.mgd.dtim_period ?: 1; if (link_id != assoc_data->assoc_link_id) { link->u.mgd.conn = assoc_data->link[link_id].conn; err = ieee80211_prep_channel(sdata, link, link_id, cbss, true, &link->u.mgd.conn, sdata->u.mgd.userspace_selectors); if (err) { link_info(link, "prep_channel failed\n"); goto out_err; } } err = ieee80211_mgd_setup_link_sta(link, sta, link_sta, assoc_data->link[link_id].bss); if (err) goto out_err; if (!ieee80211_assoc_config_link(link, link_sta, assoc_data->link[link_id].bss, mgmt, elem_start, elem_len, &changed[link_id])) goto out_err; if (assoc_data->link[link_id].status != WLAN_STATUS_SUCCESS) { valid_links &= ~BIT(link_id); ieee80211_sta_remove_link(sta, link_id); continue; } if (link_id != assoc_data->assoc_link_id) { err = ieee80211_sta_activate_link(sta, link_id); if (err) goto out_err; } } /* links might have changed due to rejected ones, set them again */ ieee80211_vif_set_links(sdata, valid_links, dormant_links); rate_control_rate_init_all_links(sta); if (ifmgd->flags & IEEE80211_STA_MFP_ENABLED) { set_sta_flag(sta, WLAN_STA_MFP); sta->sta.mfp = true; } else { sta->sta.mfp = false; } ieee80211_sta_set_max_amsdu_subframes(sta, elems->ext_capab, elems->ext_capab_len); sta->sta.wme = (elems->wmm_param || elems->s1g_capab) && local->hw.queues >= IEEE80211_NUM_ACS; err = sta_info_move_state(sta, IEEE80211_STA_ASSOC); if (!err && !(ifmgd->flags & IEEE80211_STA_CONTROL_PORT)) err = sta_info_move_state(sta, IEEE80211_STA_AUTHORIZED); if (err) { sdata_info(sdata, "failed to move station %pM to desired state\n", sta->sta.addr); WARN_ON(__sta_info_destroy(sta)); goto out_err; } if (sdata->wdev.use_4addr) drv_sta_set_4addr(local, sdata, &sta->sta, true); ieee80211_set_associated(sdata, assoc_data, changed); /* * If we're using 4-addr mode, let the AP know that we're * doing so, so that it can create the STA VLAN on its side */ if (ifmgd->use_4addr) ieee80211_send_4addr_nullfunc(local, sdata); /* * Start timer to probe the connection to the AP now. * Also start the timer that will detect beacon loss. */ ieee80211_sta_reset_beacon_monitor(sdata); ieee80211_sta_reset_conn_monitor(sdata); return true; out_err: eth_zero_addr(sdata->vif.cfg.ap_addr); return false; } static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data; u16 capab_info, status_code, aid; struct ieee80211_elems_parse_params parse_params = { .bss = NULL, .link_id = -1, .from_ap = true, }; struct ieee802_11_elems *elems; int ac; const u8 *elem_start; unsigned int elem_len; bool reassoc; struct ieee80211_event event = { .type = MLME_EVENT, .u.mlme.data = ASSOC_EVENT, }; struct ieee80211_prep_tx_info info = {}; struct cfg80211_rx_assoc_resp_data resp = { .uapsd_queues = -1, }; u8 ap_mld_addr[ETH_ALEN] __aligned(2); unsigned int link_id; u16 max_aid = IEEE80211_MAX_AID; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (!assoc_data) return; info.link_id = assoc_data->assoc_link_id; parse_params.mode = assoc_data->link[assoc_data->assoc_link_id].conn.mode; if (!ether_addr_equal(assoc_data->ap_addr, mgmt->bssid) || !ether_addr_equal(assoc_data->ap_addr, mgmt->sa)) return; /* * AssocResp and ReassocResp have identical structure, so process both * of them in this function. */ if (len < 24 + 6) return; reassoc = ieee80211_is_reassoc_resp(mgmt->frame_control); capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); status_code = le16_to_cpu(mgmt->u.assoc_resp.status_code); if (assoc_data->s1g) { elem_start = mgmt->u.s1g_assoc_resp.variable; max_aid = IEEE80211_MAX_SUPPORTED_S1G_AID; } else { elem_start = mgmt->u.assoc_resp.variable; } /* * Note: this may not be perfect, AP might misbehave - if * anyone needs to rely on perfect complete notification * with the exact right subtype, then we need to track what * we actually transmitted. */ info.subtype = reassoc ? IEEE80211_STYPE_REASSOC_REQ : IEEE80211_STYPE_ASSOC_REQ; if (assoc_data->fils_kek_len && fils_decrypt_assoc_resp(sdata, (u8 *)mgmt, &len, assoc_data) < 0) return; elem_len = len - (elem_start - (u8 *)mgmt); parse_params.start = elem_start; parse_params.len = elem_len; elems = ieee802_11_parse_elems_full(&parse_params); if (!elems) goto notify_driver; if (elems->aid_resp) aid = le16_to_cpu(elems->aid_resp->aid); else aid = le16_to_cpu(mgmt->u.assoc_resp.aid); /* * The 5 MSB of the AID field are reserved for a non-S1G STA. For * an S1G STA the 3 MSBs are reserved. * (802.11-2016 9.4.1.8 AID field). */ aid &= assoc_data->s1g ? 0x1fff : 0x7ff; sdata_info(sdata, "RX %sssocResp from %pM (capab=0x%x status=%d aid=%d)\n", reassoc ? "Rea" : "A", assoc_data->ap_addr, capab_info, status_code, (u16)(aid & ~(BIT(15) | BIT(14)))); ifmgd->broken_ap = false; if (status_code == WLAN_STATUS_ASSOC_REJECTED_TEMPORARILY && elems->timeout_int && elems->timeout_int->type == WLAN_TIMEOUT_ASSOC_COMEBACK) { u32 tu, ms; cfg80211_assoc_comeback(sdata->dev, assoc_data->ap_addr, le32_to_cpu(elems->timeout_int->value)); tu = le32_to_cpu(elems->timeout_int->value); ms = tu * 1024 / 1000; sdata_info(sdata, "%pM rejected association temporarily; comeback duration %u TU (%u ms)\n", assoc_data->ap_addr, tu, ms); assoc_data->timeout = jiffies + msecs_to_jiffies(ms); assoc_data->timeout_started = true; assoc_data->comeback = true; if (ms > IEEE80211_ASSOC_TIMEOUT) run_again(sdata, assoc_data->timeout); goto notify_driver; } if (status_code != WLAN_STATUS_SUCCESS) { sdata_info(sdata, "%pM denied association (code=%d)\n", assoc_data->ap_addr, status_code); event.u.mlme.status = MLME_DENIED; event.u.mlme.reason = status_code; drv_event_callback(sdata->local, sdata, &event); } else { if (aid == 0 || aid > max_aid) { sdata_info(sdata, "invalid AID value %d (out of range), turn off PS\n", aid); aid = 0; ifmgd->broken_ap = true; } if (ieee80211_vif_is_mld(&sdata->vif)) { struct ieee80211_mle_basic_common_info *common; if (!elems->ml_basic) { sdata_info(sdata, "MLO association with %pM but no (basic) multi-link element in response!\n", assoc_data->ap_addr); goto abandon_assoc; } common = (void *)elems->ml_basic->variable; if (memcmp(assoc_data->ap_addr, common->mld_mac_addr, ETH_ALEN)) { sdata_info(sdata, "AP MLD MAC address mismatch: got %pM expected %pM\n", common->mld_mac_addr, assoc_data->ap_addr); goto abandon_assoc; } sdata->vif.cfg.eml_cap = ieee80211_mle_get_eml_cap((const void *)elems->ml_basic); sdata->vif.cfg.eml_med_sync_delay = ieee80211_mle_get_eml_med_sync_delay((const void *)elems->ml_basic); sdata->vif.cfg.mld_capa_op = ieee80211_mle_get_mld_capa_op((const void *)elems->ml_basic); } sdata->vif.cfg.aid = aid; sdata->vif.cfg.s1g = assoc_data->s1g; if (!ieee80211_assoc_success(sdata, mgmt, elems, elem_start, elem_len)) { /* oops -- internal error -- send timeout for now */ ieee80211_destroy_assoc_data(sdata, ASSOC_TIMEOUT); goto notify_driver; } event.u.mlme.status = MLME_SUCCESS; drv_event_callback(sdata->local, sdata, &event); sdata_info(sdata, "associated\n"); info.success = 1; } for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct ieee80211_link_data *link; if (!assoc_data->link[link_id].bss) continue; resp.links[link_id].bss = assoc_data->link[link_id].bss; ether_addr_copy(resp.links[link_id].addr, assoc_data->link[link_id].addr); resp.links[link_id].status = assoc_data->link[link_id].status; link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; /* get uapsd queues configuration - same for all links */ resp.uapsd_queues = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) if (link->tx_conf[ac].uapsd) resp.uapsd_queues |= ieee80211_ac_to_qos_mask[ac]; } if (ieee80211_vif_is_mld(&sdata->vif)) { ether_addr_copy(ap_mld_addr, sdata->vif.cfg.ap_addr); resp.ap_mld_addr = ap_mld_addr; } ieee80211_destroy_assoc_data(sdata, status_code == WLAN_STATUS_SUCCESS ? ASSOC_SUCCESS : ASSOC_REJECTED); resp.buf = (u8 *)mgmt; resp.len = len; resp.req_ies = ifmgd->assoc_req_ies; resp.req_ies_len = ifmgd->assoc_req_ies_len; cfg80211_rx_assoc_resp(sdata->dev, &resp); notify_driver: drv_mgd_complete_tx(sdata->local, sdata, &info); kfree(elems); return; abandon_assoc: ieee80211_destroy_assoc_data(sdata, ASSOC_ABANDON); goto notify_driver; } static void ieee80211_rx_bss_info(struct ieee80211_link_data *link, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_rx_status *rx_status) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_bss *bss; struct ieee80211_channel *channel; lockdep_assert_wiphy(sdata->local->hw.wiphy); channel = ieee80211_get_channel_khz(local->hw.wiphy, ieee80211_rx_status_to_khz(rx_status)); if (!channel) return; bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, channel); if (bss) { link->conf->beacon_rate = bss->beacon_rate; ieee80211_rx_bss_put(local, bss); } } static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_link_data *link, struct sk_buff *skb) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_mgmt *mgmt = (void *)skb->data; struct ieee80211_if_managed *ifmgd; struct ieee80211_rx_status *rx_status = (void *) skb->cb; struct ieee80211_channel *channel; size_t baselen, len = skb->len; ifmgd = &sdata->u.mgd; lockdep_assert_wiphy(sdata->local->hw.wiphy); /* * According to Draft P802.11ax D6.0 clause 26.17.2.3.2: * "If a 6 GHz AP receives a Probe Request frame and responds with * a Probe Response frame [..], the Address 1 field of the Probe * Response frame shall be set to the broadcast address [..]" * So, on 6GHz band we should also accept broadcast responses. */ channel = ieee80211_get_channel(sdata->local->hw.wiphy, rx_status->freq); if (!channel) return; if (!ether_addr_equal(mgmt->da, sdata->vif.addr) && (channel->band != NL80211_BAND_6GHZ || !is_broadcast_ether_addr(mgmt->da))) return; /* ignore ProbeResp to foreign address */ baselen = (u8 *) mgmt->u.probe_resp.variable - (u8 *) mgmt; if (baselen > len) return; ieee80211_rx_bss_info(link, mgmt, len, rx_status); if (ifmgd->associated && ether_addr_equal(mgmt->bssid, link->u.mgd.bssid)) ieee80211_reset_ap_probe(sdata); } /* * This is the canonical list of information elements we care about, * the filter code also gives us all changes to the Microsoft OUI * (00:50:F2) vendor IE which is used for WMM which we need to track, * as well as the DTPC IE (part of the Cisco OUI) used for signaling * changes to requested client power. * * We implement beacon filtering in software since that means we can * avoid processing the frame here and in cfg80211, and userspace * will not be able to tell whether the hardware supports it or not. * * XXX: This list needs to be dynamic -- userspace needs to be able to * add items it requires. It also needs to be able to tell us to * look out for other vendor IEs. */ static const u64 care_about_ies = (1ULL << WLAN_EID_COUNTRY) | (1ULL << WLAN_EID_ERP_INFO) | (1ULL << WLAN_EID_CHANNEL_SWITCH) | (1ULL << WLAN_EID_PWR_CONSTRAINT) | (1ULL << WLAN_EID_HT_CAPABILITY) | (1ULL << WLAN_EID_HT_OPERATION) | (1ULL << WLAN_EID_EXT_CHANSWITCH_ANN); static void ieee80211_handle_beacon_sig(struct ieee80211_link_data *link, struct ieee80211_if_managed *ifmgd, struct ieee80211_bss_conf *bss_conf, struct ieee80211_local *local, struct ieee80211_rx_status *rx_status) { struct ieee80211_sub_if_data *sdata = link->sdata; /* Track average RSSI from the Beacon frames of the current AP */ if (!link->u.mgd.tracking_signal_avg) { link->u.mgd.tracking_signal_avg = true; ewma_beacon_signal_init(&link->u.mgd.ave_beacon_signal); link->u.mgd.last_cqm_event_signal = 0; link->u.mgd.count_beacon_signal = 1; link->u.mgd.last_ave_beacon_signal = 0; } else { link->u.mgd.count_beacon_signal++; } ewma_beacon_signal_add(&link->u.mgd.ave_beacon_signal, -rx_status->signal); if (ifmgd->rssi_min_thold != ifmgd->rssi_max_thold && link->u.mgd.count_beacon_signal >= IEEE80211_SIGNAL_AVE_MIN_COUNT) { int sig = -ewma_beacon_signal_read(&link->u.mgd.ave_beacon_signal); int last_sig = link->u.mgd.last_ave_beacon_signal; struct ieee80211_event event = { .type = RSSI_EVENT, }; /* * if signal crosses either of the boundaries, invoke callback * with appropriate parameters */ if (sig > ifmgd->rssi_max_thold && (last_sig <= ifmgd->rssi_min_thold || last_sig == 0)) { link->u.mgd.last_ave_beacon_signal = sig; event.u.rssi.data = RSSI_EVENT_HIGH; drv_event_callback(local, sdata, &event); } else if (sig < ifmgd->rssi_min_thold && (last_sig >= ifmgd->rssi_max_thold || last_sig == 0)) { link->u.mgd.last_ave_beacon_signal = sig; event.u.rssi.data = RSSI_EVENT_LOW; drv_event_callback(local, sdata, &event); } } if (bss_conf->cqm_rssi_thold && link->u.mgd.count_beacon_signal >= IEEE80211_SIGNAL_AVE_MIN_COUNT && !(sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI)) { int sig = -ewma_beacon_signal_read(&link->u.mgd.ave_beacon_signal); int last_event = link->u.mgd.last_cqm_event_signal; int thold = bss_conf->cqm_rssi_thold; int hyst = bss_conf->cqm_rssi_hyst; if (sig < thold && (last_event == 0 || sig < last_event - hyst)) { link->u.mgd.last_cqm_event_signal = sig; ieee80211_cqm_rssi_notify( &sdata->vif, NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW, sig, GFP_KERNEL); } else if (sig > thold && (last_event == 0 || sig > last_event + hyst)) { link->u.mgd.last_cqm_event_signal = sig; ieee80211_cqm_rssi_notify( &sdata->vif, NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH, sig, GFP_KERNEL); } } if (bss_conf->cqm_rssi_low && link->u.mgd.count_beacon_signal >= IEEE80211_SIGNAL_AVE_MIN_COUNT) { int sig = -ewma_beacon_signal_read(&link->u.mgd.ave_beacon_signal); int last_event = link->u.mgd.last_cqm_event_signal; int low = bss_conf->cqm_rssi_low; int high = bss_conf->cqm_rssi_high; if (sig < low && (last_event == 0 || last_event >= low)) { link->u.mgd.last_cqm_event_signal = sig; ieee80211_cqm_rssi_notify( &sdata->vif, NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW, sig, GFP_KERNEL); } else if (sig > high && (last_event == 0 || last_event <= high)) { link->u.mgd.last_cqm_event_signal = sig; ieee80211_cqm_rssi_notify( &sdata->vif, NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH, sig, GFP_KERNEL); } } } static bool ieee80211_rx_our_beacon(const u8 *tx_bssid, struct cfg80211_bss *bss) { if (ether_addr_equal(tx_bssid, bss->bssid)) return true; if (!bss->transmitted_bss) return false; return ether_addr_equal(tx_bssid, bss->transmitted_bss->bssid); } static void ieee80211_ml_reconf_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, u.mgd.ml_reconf_work.work); u16 new_valid_links, new_active_links, new_dormant_links; int ret; if (!sdata->u.mgd.removed_links) return; sdata_info(sdata, "MLO Reconfiguration: work: valid=0x%x, removed=0x%x\n", sdata->vif.valid_links, sdata->u.mgd.removed_links); new_valid_links = sdata->vif.valid_links & ~sdata->u.mgd.removed_links; if (new_valid_links == sdata->vif.valid_links) return; if (!new_valid_links || !(new_valid_links & ~sdata->vif.dormant_links)) { sdata_info(sdata, "No valid links after reconfiguration\n"); ret = -EINVAL; goto out; } new_active_links = sdata->vif.active_links & ~sdata->u.mgd.removed_links; if (new_active_links != sdata->vif.active_links) { if (!new_active_links) new_active_links = BIT(ffs(new_valid_links & ~sdata->vif.dormant_links) - 1); ret = ieee80211_set_active_links(&sdata->vif, new_active_links); if (ret) { sdata_info(sdata, "Failed setting active links\n"); goto out; } } new_dormant_links = sdata->vif.dormant_links & ~sdata->u.mgd.removed_links; ret = ieee80211_vif_set_links(sdata, new_valid_links, new_dormant_links); if (ret) sdata_info(sdata, "Failed setting valid links\n"); ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_MLD_VALID_LINKS); out: if (!ret) cfg80211_links_removed(sdata->dev, sdata->u.mgd.removed_links); else __ieee80211_disconnect(sdata); sdata->u.mgd.removed_links = 0; } static void ieee80211_ml_reconfiguration(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems) { const struct element *sub; unsigned long removed_links = 0; u16 link_removal_timeout[IEEE80211_MLD_MAX_NUM_LINKS] = {}; u8 link_id; u32 delay; if (!ieee80211_vif_is_mld(&sdata->vif) || !elems->ml_reconf) return; /* Directly parse the sub elements as the common information doesn't * hold any useful information. */ for_each_mle_subelement(sub, (const u8 *)elems->ml_reconf, elems->ml_reconf_len) { struct ieee80211_mle_per_sta_profile *prof = (void *)sub->data; u8 *pos = prof->variable; u16 control; if (sub->id != IEEE80211_MLE_SUBELEM_PER_STA_PROFILE) continue; if (!ieee80211_mle_reconf_sta_prof_size_ok(sub->data, sub->datalen)) return; control = le16_to_cpu(prof->control); link_id = control & IEEE80211_MLE_STA_RECONF_CONTROL_LINK_ID; removed_links |= BIT(link_id); /* the MAC address should not be included, but handle it */ if (control & IEEE80211_MLE_STA_RECONF_CONTROL_STA_MAC_ADDR_PRESENT) pos += 6; /* According to Draft P802.11be_D3.0, the control should * include the AP Removal Timer present. If the AP Removal Timer * is not present assume immediate removal. */ if (control & IEEE80211_MLE_STA_RECONF_CONTROL_AP_REM_TIMER_PRESENT) link_removal_timeout[link_id] = get_unaligned_le16(pos); } removed_links &= sdata->vif.valid_links; if (!removed_links) { /* In case the removal was cancelled, abort it */ if (sdata->u.mgd.removed_links) { sdata->u.mgd.removed_links = 0; wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.ml_reconf_work); } return; } delay = 0; for_each_set_bit(link_id, &removed_links, IEEE80211_MLD_MAX_NUM_LINKS) { struct ieee80211_bss_conf *link_conf = sdata_dereference(sdata->vif.link_conf[link_id], sdata); u32 link_delay; if (!link_conf) { removed_links &= ~BIT(link_id); continue; } if (link_removal_timeout[link_id] < 1) link_delay = 0; else link_delay = link_conf->beacon_int * (link_removal_timeout[link_id] - 1); if (!delay) delay = link_delay; else delay = min(delay, link_delay); } sdata->u.mgd.removed_links = removed_links; wiphy_delayed_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.ml_reconf_work, TU_TO_JIFFIES(delay)); } static int ieee80211_ttlm_set_links(struct ieee80211_sub_if_data *sdata, u16 active_links, u16 dormant_links, u16 suspended_links) { u64 changed = 0; int ret; if (!active_links) { ret = -EINVAL; goto out; } /* If there is an active negotiated TTLM, it should be discarded by * the new negotiated/advertised TTLM. */ if (sdata->vif.neg_ttlm.valid) { memset(&sdata->vif.neg_ttlm, 0, sizeof(sdata->vif.neg_ttlm)); sdata->vif.suspended_links = 0; changed = BSS_CHANGED_MLD_TTLM; } if (sdata->vif.active_links != active_links) { /* usable links are affected when active_links are changed, * so notify the driver about the status change */ changed |= BSS_CHANGED_MLD_VALID_LINKS; active_links &= sdata->vif.active_links; if (!active_links) active_links = BIT(__ffs(sdata->vif.valid_links & ~dormant_links)); ret = ieee80211_set_active_links(&sdata->vif, active_links); if (ret) { sdata_info(sdata, "Failed to set TTLM active links\n"); goto out; } } ret = ieee80211_vif_set_links(sdata, sdata->vif.valid_links, dormant_links); if (ret) { sdata_info(sdata, "Failed to set TTLM dormant links\n"); goto out; } sdata->vif.suspended_links = suspended_links; if (sdata->vif.suspended_links) changed |= BSS_CHANGED_MLD_TTLM; ieee80211_vif_cfg_change_notify(sdata, changed); out: if (ret) ieee80211_disconnect(&sdata->vif, false); return ret; } static void ieee80211_tid_to_link_map_work(struct wiphy *wiphy, struct wiphy_work *work) { u16 new_active_links, new_dormant_links; struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, u.mgd.ttlm_work.work); new_active_links = sdata->u.mgd.ttlm_info.map & sdata->vif.valid_links; new_dormant_links = ~sdata->u.mgd.ttlm_info.map & sdata->vif.valid_links; ieee80211_vif_set_links(sdata, sdata->vif.valid_links, 0); if (ieee80211_ttlm_set_links(sdata, new_active_links, new_dormant_links, 0)) return; sdata->u.mgd.ttlm_info.active = true; sdata->u.mgd.ttlm_info.switch_time = 0; } static u16 ieee80211_get_ttlm(u8 bm_size, u8 *data) { if (bm_size == 1) return *data; else return get_unaligned_le16(data); } static int ieee80211_parse_adv_t2l(struct ieee80211_sub_if_data *sdata, const struct ieee80211_ttlm_elem *ttlm, struct ieee80211_adv_ttlm_info *ttlm_info) { /* The element size was already validated in * ieee80211_tid_to_link_map_size_ok() */ u8 control, link_map_presence, map_size, tid; u8 *pos; memset(ttlm_info, 0, sizeof(*ttlm_info)); pos = (void *)ttlm->optional; control = ttlm->control; if ((control & IEEE80211_TTLM_CONTROL_DEF_LINK_MAP) || !(control & IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT)) return 0; if ((control & IEEE80211_TTLM_CONTROL_DIRECTION) != IEEE80211_TTLM_DIRECTION_BOTH) { sdata_info(sdata, "Invalid advertised T2L map direction\n"); return -EINVAL; } link_map_presence = *pos; pos++; ttlm_info->switch_time = get_unaligned_le16(pos); /* Since ttlm_info->switch_time == 0 means no switch time, bump it * by 1. */ if (!ttlm_info->switch_time) ttlm_info->switch_time = 1; pos += 2; if (control & IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT) { ttlm_info->duration = pos[0] | pos[1] << 8 | pos[2] << 16; pos += 3; } if (control & IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE) map_size = 1; else map_size = 2; /* According to Draft P802.11be_D3.0 clause 35.3.7.1.7, an AP MLD shall * not advertise a TID-to-link mapping that does not map all TIDs to the * same link set, reject frame if not all links have mapping */ if (link_map_presence != 0xff) { sdata_info(sdata, "Invalid advertised T2L mapping presence indicator\n"); return -EINVAL; } ttlm_info->map = ieee80211_get_ttlm(map_size, pos); if (!ttlm_info->map) { sdata_info(sdata, "Invalid advertised T2L map for TID 0\n"); return -EINVAL; } pos += map_size; for (tid = 1; tid < 8; tid++) { u16 map = ieee80211_get_ttlm(map_size, pos); if (map != ttlm_info->map) { sdata_info(sdata, "Invalid advertised T2L map for tid %d\n", tid); return -EINVAL; } pos += map_size; } return 0; } static void ieee80211_process_adv_ttlm(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, u64 beacon_ts) { u8 i; int ret; if (!ieee80211_vif_is_mld(&sdata->vif)) return; if (!elems->ttlm_num) { if (sdata->u.mgd.ttlm_info.switch_time) { /* if a planned TID-to-link mapping was cancelled - * abort it */ wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.ttlm_work); } else if (sdata->u.mgd.ttlm_info.active) { /* if no TID-to-link element, set to default mapping in * which all TIDs are mapped to all setup links */ ret = ieee80211_vif_set_links(sdata, sdata->vif.valid_links, 0); if (ret) { sdata_info(sdata, "Failed setting valid/dormant links\n"); return; } ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_MLD_VALID_LINKS); } memset(&sdata->u.mgd.ttlm_info, 0, sizeof(sdata->u.mgd.ttlm_info)); return; } for (i = 0; i < elems->ttlm_num; i++) { struct ieee80211_adv_ttlm_info ttlm_info; u32 res; res = ieee80211_parse_adv_t2l(sdata, elems->ttlm[i], &ttlm_info); if (res) { __ieee80211_disconnect(sdata); return; } if (ttlm_info.switch_time) { u16 beacon_ts_tu, st_tu, delay; u32 delay_jiffies; u64 mask; /* The t2l map switch time is indicated with a partial * TSF value (bits 10 to 25), get the partial beacon TS * as well, and calc the delay to the start time. */ mask = GENMASK_ULL(25, 10); beacon_ts_tu = (beacon_ts & mask) >> 10; st_tu = ttlm_info.switch_time; delay = st_tu - beacon_ts_tu; /* * If the switch time is far in the future, then it * could also be the previous switch still being * announced. * We can simply ignore it for now, if it is a future * switch the AP will continue to announce it anyway. */ if (delay > IEEE80211_ADV_TTLM_ST_UNDERFLOW) return; delay_jiffies = TU_TO_JIFFIES(delay); /* Link switching can take time, so schedule it * 100ms before to be ready on time */ if (delay_jiffies > IEEE80211_ADV_TTLM_SAFETY_BUFFER_MS) delay_jiffies -= IEEE80211_ADV_TTLM_SAFETY_BUFFER_MS; else delay_jiffies = 0; sdata->u.mgd.ttlm_info = ttlm_info; wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.ttlm_work); wiphy_delayed_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.ttlm_work, delay_jiffies); return; } } } static void ieee80211_mgd_check_cross_link_csa(struct ieee80211_sub_if_data *sdata, int reporting_link_id, struct ieee802_11_elems *elems) { const struct element *sta_profiles[IEEE80211_MLD_MAX_NUM_LINKS] = {}; ssize_t sta_profiles_len[IEEE80211_MLD_MAX_NUM_LINKS] = {}; const struct element *sub; const u8 *subelems; size_t subelems_len; u8 common_size; int link_id; if (!ieee80211_mle_size_ok((u8 *)elems->ml_basic, elems->ml_basic_len)) return; common_size = ieee80211_mle_common_size((u8 *)elems->ml_basic); subelems = (u8 *)elems->ml_basic + common_size; subelems_len = elems->ml_basic_len - common_size; for_each_element_id(sub, IEEE80211_MLE_SUBELEM_PER_STA_PROFILE, subelems, subelems_len) { struct ieee80211_mle_per_sta_profile *prof = (void *)sub->data; struct ieee80211_link_data *link; ssize_t len; if (!ieee80211_mle_basic_sta_prof_size_ok(sub->data, sub->datalen)) continue; link_id = le16_get_bits(prof->control, IEEE80211_MLE_STA_CONTROL_LINK_ID); /* need a valid link ID, but also not our own, both AP bugs */ if (link_id == reporting_link_id || link_id >= IEEE80211_MLD_MAX_NUM_LINKS) continue; link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; len = cfg80211_defragment_element(sub, subelems, subelems_len, NULL, 0, IEEE80211_MLE_SUBELEM_FRAGMENT); if (WARN_ON(len < 0)) continue; sta_profiles[link_id] = sub; sta_profiles_len[link_id] = len; } for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct ieee80211_mle_per_sta_profile *prof; struct ieee802_11_elems *prof_elems; struct ieee80211_link_data *link; ssize_t len; if (link_id == reporting_link_id) continue; link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; if (!sta_profiles[link_id]) { prof_elems = NULL; goto handle; } /* we can defragment in-place, won't use the buffer again */ len = cfg80211_defragment_element(sta_profiles[link_id], subelems, subelems_len, (void *)sta_profiles[link_id], sta_profiles_len[link_id], IEEE80211_MLE_SUBELEM_FRAGMENT); if (WARN_ON(len != sta_profiles_len[link_id])) continue; prof = (void *)sta_profiles[link_id]; prof_elems = ieee802_11_parse_elems(prof->variable + (prof->sta_info_len - 1), len - (prof->sta_info_len - 1), false, NULL); /* memory allocation failed - let's hope that's transient */ if (!prof_elems) continue; handle: /* * FIXME: the timings here are obviously incorrect, * but only older Intel drivers seem to care, and * those don't have MLO. If you really need this, * the problem is having to calculate it with the * TSF offset etc. The device_timestamp is still * correct, of course. */ ieee80211_sta_process_chanswitch(link, 0, 0, elems, prof_elems, IEEE80211_CSA_SOURCE_OTHER_LINK); kfree(prof_elems); } } static bool ieee80211_mgd_ssid_mismatch(struct ieee80211_sub_if_data *sdata, const struct ieee802_11_elems *elems) { struct ieee80211_vif_cfg *cfg = &sdata->vif.cfg; static u8 zero_ssid[IEEE80211_MAX_SSID_LEN]; if (!elems->ssid) return false; /* hidden SSID: zero length */ if (elems->ssid_len == 0) return false; if (elems->ssid_len != cfg->ssid_len) return true; /* hidden SSID: zeroed out */ if (!memcmp(elems->ssid, zero_ssid, elems->ssid_len)) return false; return memcmp(elems->ssid, cfg->ssid, cfg->ssid_len); } static bool ieee80211_rx_beacon_freq_valid(struct ieee80211_local *local, struct ieee80211_mgmt *mgmt, struct ieee80211_rx_status *rx_status, struct ieee80211_chanctx_conf *chanctx) { u32 pri_2mhz_khz; struct ieee80211_channel *s1g_sibling_1mhz; u32 pri_khz = ieee80211_channel_to_khz(chanctx->def.chan); u32 rx_khz = ieee80211_rx_status_to_khz(rx_status); if (rx_khz == pri_khz) return true; if (!chanctx->def.s1g_primary_2mhz) return false; /* * If we have an S1G interface with a 2MHz primary, beacons are * sent on the center frequency of the 2MHz primary. Find the sibling * 1MHz channel and calculate the 2MHz primary center frequency. */ s1g_sibling_1mhz = cfg80211_s1g_get_primary_sibling(local->hw.wiphy, &chanctx->def); if (!s1g_sibling_1mhz) return false; pri_2mhz_khz = (pri_khz + ieee80211_channel_to_khz(s1g_sibling_1mhz)) / 2; return rx_khz == pri_2mhz_khz; } static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, struct ieee80211_hdr *hdr, size_t len, struct ieee80211_rx_status *rx_status) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_bss_conf *bss_conf = link->conf; struct ieee80211_vif_cfg *vif_cfg = &sdata->vif.cfg; struct ieee80211_mgmt *mgmt = (void *) hdr; struct ieee80211_ext *ext = NULL; size_t baselen; struct ieee802_11_elems *elems; struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_supported_band *sband; struct ieee80211_channel *chan; struct link_sta_info *link_sta; struct sta_info *sta; u64 changed = 0; bool erp_valid; u8 erp_value = 0; u32 ncrc = 0; u8 *bssid, *variable = mgmt->u.beacon.variable; u8 deauth_buf[IEEE80211_DEAUTH_FRAME_LEN]; struct ieee80211_elems_parse_params parse_params = { .mode = link->u.mgd.conn.mode, .link_id = -1, .from_ap = true, }; lockdep_assert_wiphy(local->hw.wiphy); /* Process beacon from the current BSS */ bssid = ieee80211_get_bssid(hdr, len, sdata->vif.type); if (ieee80211_is_s1g_beacon(mgmt->frame_control)) { ext = (void *)mgmt; variable = ext->u.s1g_beacon.variable + ieee80211_s1g_optional_len(ext->frame_control); } baselen = (u8 *) variable - (u8 *) mgmt; if (baselen > len) return; parse_params.start = variable; parse_params.len = len - baselen; rcu_read_lock(); chanctx_conf = rcu_dereference(bss_conf->chanctx_conf); if (!chanctx_conf) { rcu_read_unlock(); return; } if (!ieee80211_rx_beacon_freq_valid(local, mgmt, rx_status, chanctx_conf)) { rcu_read_unlock(); return; } chan = chanctx_conf->def.chan; rcu_read_unlock(); if (ifmgd->assoc_data && ifmgd->assoc_data->need_beacon && !WARN_ON(ieee80211_vif_is_mld(&sdata->vif)) && ieee80211_rx_our_beacon(bssid, ifmgd->assoc_data->link[0].bss)) { parse_params.bss = ifmgd->assoc_data->link[0].bss; elems = ieee802_11_parse_elems_full(&parse_params); if (!elems) return; ieee80211_rx_bss_info(link, mgmt, len, rx_status); if (elems->dtim_period) link->u.mgd.dtim_period = elems->dtim_period; link->u.mgd.have_beacon = true; ifmgd->assoc_data->need_beacon = false; if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY) && !ieee80211_is_s1g_beacon(hdr->frame_control)) { bss_conf->sync_tsf = le64_to_cpu(mgmt->u.beacon.timestamp); bss_conf->sync_device_ts = rx_status->device_timestamp; bss_conf->sync_dtim_count = elems->dtim_count; } if (elems->mbssid_config_ie) bss_conf->profile_periodicity = elems->mbssid_config_ie->profile_periodicity; else bss_conf->profile_periodicity = 0; if (elems->ext_capab_len >= 11 && (elems->ext_capab[10] & WLAN_EXT_CAPA11_EMA_SUPPORT)) bss_conf->ema_ap = true; else bss_conf->ema_ap = false; /* continue assoc process */ ifmgd->assoc_data->timeout = jiffies; ifmgd->assoc_data->timeout_started = true; run_again(sdata, ifmgd->assoc_data->timeout); kfree(elems); return; } if (!ifmgd->associated || !ieee80211_rx_our_beacon(bssid, bss_conf->bss)) return; bssid = link->u.mgd.bssid; if (!(rx_status->flag & RX_FLAG_NO_SIGNAL_VAL)) ieee80211_handle_beacon_sig(link, ifmgd, bss_conf, local, rx_status); if (ifmgd->flags & IEEE80211_STA_CONNECTION_POLL) { mlme_dbg_ratelimited(sdata, "cancelling AP probe due to a received beacon\n"); ieee80211_reset_ap_probe(sdata); } /* * Push the beacon loss detection into the future since * we are processing a beacon from the AP just now. */ ieee80211_sta_reset_beacon_monitor(sdata); /* TODO: CRC urrently not calculated on S1G Beacon Compatibility * element (which carries the beacon interval). Don't forget to add a * bit to care_about_ies[] above if mac80211 is interested in a * changing S1G element. */ if (!ieee80211_is_s1g_beacon(hdr->frame_control)) ncrc = crc32_be(0, (void *)&mgmt->u.beacon.beacon_int, 4); parse_params.bss = bss_conf->bss; parse_params.filter = care_about_ies; parse_params.crc = ncrc; elems = ieee802_11_parse_elems_full(&parse_params); if (!elems) return; if (rx_status->flag & RX_FLAG_DECRYPTED && ieee80211_mgd_ssid_mismatch(sdata, elems)) { sdata_info(sdata, "SSID mismatch for AP %pM, disconnect\n", sdata->vif.cfg.ap_addr); __ieee80211_disconnect(sdata); return; } ncrc = elems->crc; if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && ieee80211_check_tim(elems->tim, elems->tim_len, vif_cfg->aid, vif_cfg->s1g)) { if (local->hw.conf.dynamic_ps_timeout > 0) { if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } ieee80211_send_nullfunc(local, sdata, false); } else if (!local->pspolling && sdata->u.mgd.powersave) { local->pspolling = true; /* * Here is assumed that the driver will be * able to send ps-poll frame and receive a * response even though power save mode is * enabled, but some drivers might require * to disable power save here. This needs * to be investigated. */ ieee80211_send_pspoll(local, sdata); } } if (sdata->vif.p2p || sdata->vif.driver_flags & IEEE80211_VIF_GET_NOA_UPDATE) { struct ieee80211_p2p_noa_attr noa = {}; int ret; ret = cfg80211_get_p2p_attr(variable, len - baselen, IEEE80211_P2P_ATTR_ABSENCE_NOTICE, (u8 *) &noa, sizeof(noa)); if (ret >= 2) { if (link->u.mgd.p2p_noa_index != noa.index) { /* valid noa_attr and index changed */ link->u.mgd.p2p_noa_index = noa.index; memcpy(&bss_conf->p2p_noa_attr, &noa, sizeof(noa)); changed |= BSS_CHANGED_P2P_PS; /* * make sure we update all information, the CRC * mechanism doesn't look at P2P attributes. */ link->u.mgd.beacon_crc_valid = false; } } else if (link->u.mgd.p2p_noa_index != -1) { /* noa_attr not found and we had valid noa_attr before */ link->u.mgd.p2p_noa_index = -1; memset(&bss_conf->p2p_noa_attr, 0, sizeof(bss_conf->p2p_noa_attr)); changed |= BSS_CHANGED_P2P_PS; link->u.mgd.beacon_crc_valid = false; } } /* * Update beacon timing and dtim count on every beacon appearance. This * will allow the driver to use the most updated values. Do it before * comparing this one with last received beacon. * IMPORTANT: These parameters would possibly be out of sync by the time * the driver will use them. The synchronized view is currently * guaranteed only in certain callbacks. */ if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY) && !ieee80211_is_s1g_beacon(hdr->frame_control)) { bss_conf->sync_tsf = le64_to_cpu(mgmt->u.beacon.timestamp); bss_conf->sync_device_ts = rx_status->device_timestamp; bss_conf->sync_dtim_count = elems->dtim_count; } if ((ncrc == link->u.mgd.beacon_crc && link->u.mgd.beacon_crc_valid) || (ext && ieee80211_is_s1g_short_beacon(ext->frame_control, parse_params.start, parse_params.len))) goto free; link->u.mgd.beacon_crc = ncrc; link->u.mgd.beacon_crc_valid = true; ieee80211_rx_bss_info(link, mgmt, len, rx_status); ieee80211_sta_process_chanswitch(link, rx_status->mactime, rx_status->device_timestamp, elems, elems, IEEE80211_CSA_SOURCE_BEACON); /* note that after this elems->ml_basic can no longer be used fully */ ieee80211_mgd_check_cross_link_csa(sdata, rx_status->link_id, elems); ieee80211_mgd_update_bss_param_ch_cnt(sdata, bss_conf, elems); if (!sdata->u.mgd.epcs.enabled && !link->u.mgd.disable_wmm_tracking && ieee80211_sta_wmm_params(local, link, elems->wmm_param, elems->wmm_param_len, elems->mu_edca_param_set)) changed |= BSS_CHANGED_QOS; /* * If we haven't had a beacon before, tell the driver about the * DTIM period (and beacon timing if desired) now. */ if (!link->u.mgd.have_beacon) { /* a few bogus AP send dtim_period = 0 or no TIM IE */ bss_conf->dtim_period = elems->dtim_period ?: 1; changed |= BSS_CHANGED_BEACON_INFO; link->u.mgd.have_beacon = true; ieee80211_recalc_ps(local); ieee80211_recalc_ps_vif(sdata); } if (elems->erp_info) { erp_valid = true; erp_value = elems->erp_info[0]; } else { erp_valid = false; } if (!ieee80211_is_s1g_beacon(hdr->frame_control)) changed |= ieee80211_handle_bss_capability(link, le16_to_cpu(mgmt->u.beacon.capab_info), erp_valid, erp_value); sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr); if (WARN_ON(!sta)) { goto free; } link_sta = rcu_dereference_protected(sta->link[link->link_id], lockdep_is_held(&local->hw.wiphy->mtx)); if (WARN_ON(!link_sta)) { goto free; } if (WARN_ON(!bss_conf->chanreq.oper.chan)) goto free; sband = local->hw.wiphy->bands[bss_conf->chanreq.oper.chan->band]; changed |= ieee80211_recalc_twt_req(sdata, sband, link, link_sta, elems); if (ieee80211_config_bw(link, elems, true, &changed, IEEE80211_STYPE_BEACON)) { ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, WLAN_REASON_DEAUTH_LEAVING, true, deauth_buf); ieee80211_report_disconnect(sdata, deauth_buf, sizeof(deauth_buf), true, WLAN_REASON_DEAUTH_LEAVING, false); goto free; } if (elems->opmode_notif) ieee80211_vht_handle_opmode(sdata, link_sta, *elems->opmode_notif, rx_status->band); changed |= ieee80211_handle_pwr_constr(link, chan, mgmt, elems->country_elem, elems->country_elem_len, elems->pwr_constr_elem, elems->cisco_dtpc_elem); ieee80211_ml_reconfiguration(sdata, elems); ieee80211_process_adv_ttlm(sdata, elems, le64_to_cpu(mgmt->u.beacon.timestamp)); ieee80211_link_info_change_notify(sdata, link, changed); free: kfree(elems); } static void ieee80211_apply_neg_ttlm(struct ieee80211_sub_if_data *sdata, struct ieee80211_neg_ttlm neg_ttlm) { u16 new_active_links, new_dormant_links, new_suspended_links, map = 0; u8 i; for (i = 0; i < IEEE80211_TTLM_NUM_TIDS; i++) map |= neg_ttlm.downlink[i] | neg_ttlm.uplink[i]; /* If there is an active TTLM, unset previously suspended links */ if (sdata->vif.neg_ttlm.valid) sdata->vif.dormant_links &= ~sdata->vif.suspended_links; /* exclude links that are already disabled by advertised TTLM */ new_active_links = map & sdata->vif.valid_links & ~sdata->vif.dormant_links; new_suspended_links = (~map & sdata->vif.valid_links) & ~sdata->vif.dormant_links; new_dormant_links = sdata->vif.dormant_links | new_suspended_links; if (ieee80211_ttlm_set_links(sdata, new_active_links, new_dormant_links, new_suspended_links)) return; sdata->vif.neg_ttlm = neg_ttlm; sdata->vif.neg_ttlm.valid = true; } static void ieee80211_neg_ttlm_timeout_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, u.mgd.neg_ttlm_timeout_work.work); sdata_info(sdata, "No negotiated TTLM response from AP, disconnecting.\n"); __ieee80211_disconnect(sdata); } static void ieee80211_neg_ttlm_add_suggested_map(struct sk_buff *skb, struct ieee80211_neg_ttlm *neg_ttlm) { u8 i, direction[IEEE80211_TTLM_MAX_CNT]; if (memcmp(neg_ttlm->downlink, neg_ttlm->uplink, sizeof(neg_ttlm->downlink))) { direction[0] = IEEE80211_TTLM_DIRECTION_DOWN; direction[1] = IEEE80211_TTLM_DIRECTION_UP; } else { direction[0] = IEEE80211_TTLM_DIRECTION_BOTH; } for (i = 0; i < ARRAY_SIZE(direction); i++) { u8 tid, len, map_ind = 0, *len_pos, *map_ind_pos, *pos; __le16 map; len = sizeof(struct ieee80211_ttlm_elem) + 1 + 1; pos = skb_put(skb, len + 2); *pos++ = WLAN_EID_EXTENSION; len_pos = pos++; *pos++ = WLAN_EID_EXT_TID_TO_LINK_MAPPING; *pos++ = direction[i]; map_ind_pos = pos++; for (tid = 0; tid < IEEE80211_TTLM_NUM_TIDS; tid++) { map = direction[i] == IEEE80211_TTLM_DIRECTION_UP ? cpu_to_le16(neg_ttlm->uplink[tid]) : cpu_to_le16(neg_ttlm->downlink[tid]); if (!map) continue; len += 2; map_ind |= BIT(tid); skb_put_data(skb, &map, sizeof(map)); } *map_ind_pos = map_ind; *len_pos = len; if (direction[i] == IEEE80211_TTLM_DIRECTION_BOTH) break; } } static void ieee80211_send_neg_ttlm_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_neg_ttlm *neg_ttlm, u8 dialog_token) { struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct sk_buff *skb; int hdr_len = offsetofend(struct ieee80211_mgmt, u.action.u.ttlm_req); int ttlm_max_len = 2 + 1 + sizeof(struct ieee80211_ttlm_elem) + 1 + 2 * 2 * IEEE80211_TTLM_NUM_TIDS; skb = dev_alloc_skb(local->tx_headroom + hdr_len + ttlm_max_len); if (!skb) return; skb_reserve(skb, local->tx_headroom); mgmt = skb_put_zero(skb, hdr_len); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); memcpy(mgmt->da, sdata->vif.cfg.ap_addr, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; mgmt->u.action.u.ttlm_req.action_code = WLAN_PROTECTED_EHT_ACTION_TTLM_REQ; mgmt->u.action.u.ttlm_req.dialog_token = dialog_token; ieee80211_neg_ttlm_add_suggested_map(skb, neg_ttlm); ieee80211_tx_skb(sdata, skb); } int ieee80211_req_neg_ttlm(struct ieee80211_sub_if_data *sdata, struct cfg80211_ttlm_params *params) { struct ieee80211_neg_ttlm neg_ttlm = {}; u8 i; if (!ieee80211_vif_is_mld(&sdata->vif) || !(sdata->vif.cfg.mld_capa_op & IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP)) return -EINVAL; for (i = 0; i < IEEE80211_TTLM_NUM_TIDS; i++) { if ((params->dlink[i] & ~sdata->vif.valid_links) || (params->ulink[i] & ~sdata->vif.valid_links)) return -EINVAL; neg_ttlm.downlink[i] = params->dlink[i]; neg_ttlm.uplink[i] = params->ulink[i]; } if (drv_can_neg_ttlm(sdata->local, sdata, &neg_ttlm) != NEG_TTLM_RES_ACCEPT) return -EINVAL; ieee80211_apply_neg_ttlm(sdata, neg_ttlm); sdata->u.mgd.dialog_token_alloc++; ieee80211_send_neg_ttlm_req(sdata, &sdata->vif.neg_ttlm, sdata->u.mgd.dialog_token_alloc); wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.neg_ttlm_timeout_work); wiphy_delayed_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.neg_ttlm_timeout_work, IEEE80211_NEG_TTLM_REQ_TIMEOUT); return 0; } static void ieee80211_send_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, enum ieee80211_neg_ttlm_res ttlm_res, u8 dialog_token, struct ieee80211_neg_ttlm *neg_ttlm) { struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct sk_buff *skb; int hdr_len = offsetofend(struct ieee80211_mgmt, u.action.u.ttlm_res); int ttlm_max_len = 2 + 1 + sizeof(struct ieee80211_ttlm_elem) + 1 + 2 * 2 * IEEE80211_TTLM_NUM_TIDS; u16 status_code; skb = dev_alloc_skb(local->tx_headroom + hdr_len + ttlm_max_len); if (!skb) return; skb_reserve(skb, local->tx_headroom); mgmt = skb_put_zero(skb, hdr_len); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); memcpy(mgmt->da, sdata->vif.cfg.ap_addr, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; mgmt->u.action.u.ttlm_res.action_code = WLAN_PROTECTED_EHT_ACTION_TTLM_RES; mgmt->u.action.u.ttlm_res.dialog_token = dialog_token; switch (ttlm_res) { default: WARN_ON(1); fallthrough; case NEG_TTLM_RES_REJECT: status_code = WLAN_STATUS_DENIED_TID_TO_LINK_MAPPING; break; case NEG_TTLM_RES_ACCEPT: status_code = WLAN_STATUS_SUCCESS; break; case NEG_TTLM_RES_SUGGEST_PREFERRED: status_code = WLAN_STATUS_PREF_TID_TO_LINK_MAPPING_SUGGESTED; ieee80211_neg_ttlm_add_suggested_map(skb, neg_ttlm); break; } mgmt->u.action.u.ttlm_res.status_code = cpu_to_le16(status_code); ieee80211_tx_skb(sdata, skb); } static int ieee80211_parse_neg_ttlm(struct ieee80211_sub_if_data *sdata, const struct ieee80211_ttlm_elem *ttlm, struct ieee80211_neg_ttlm *neg_ttlm, u8 *direction) { u8 control, link_map_presence, map_size, tid; u8 *pos; /* The element size was already validated in * ieee80211_tid_to_link_map_size_ok() */ pos = (void *)ttlm->optional; control = ttlm->control; /* mapping switch time and expected duration fields are not expected * in case of negotiated TTLM */ if (control & (IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT | IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT)) { mlme_dbg(sdata, "Invalid TTLM element in negotiated TTLM request\n"); return -EINVAL; } if (control & IEEE80211_TTLM_CONTROL_DEF_LINK_MAP) { for (tid = 0; tid < IEEE80211_TTLM_NUM_TIDS; tid++) { neg_ttlm->downlink[tid] = sdata->vif.valid_links; neg_ttlm->uplink[tid] = sdata->vif.valid_links; } *direction = IEEE80211_TTLM_DIRECTION_BOTH; return 0; } *direction = u8_get_bits(control, IEEE80211_TTLM_CONTROL_DIRECTION); if (*direction != IEEE80211_TTLM_DIRECTION_DOWN && *direction != IEEE80211_TTLM_DIRECTION_UP && *direction != IEEE80211_TTLM_DIRECTION_BOTH) return -EINVAL; link_map_presence = *pos; pos++; if (control & IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE) map_size = 1; else map_size = 2; for (tid = 0; tid < IEEE80211_TTLM_NUM_TIDS; tid++) { u16 map; if (link_map_presence & BIT(tid)) { map = ieee80211_get_ttlm(map_size, pos); if (!map) { mlme_dbg(sdata, "No active links for TID %d", tid); return -EINVAL; } } else { map = 0; } switch (*direction) { case IEEE80211_TTLM_DIRECTION_BOTH: neg_ttlm->downlink[tid] = map; neg_ttlm->uplink[tid] = map; break; case IEEE80211_TTLM_DIRECTION_DOWN: neg_ttlm->downlink[tid] = map; break; case IEEE80211_TTLM_DIRECTION_UP: neg_ttlm->uplink[tid] = map; break; default: return -EINVAL; } pos += map_size; } return 0; } void ieee80211_process_neg_ttlm_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { u8 dialog_token, direction[IEEE80211_TTLM_MAX_CNT] = {}, i; size_t ies_len; enum ieee80211_neg_ttlm_res ttlm_res = NEG_TTLM_RES_ACCEPT; struct ieee802_11_elems *elems = NULL; struct ieee80211_neg_ttlm neg_ttlm = {}; BUILD_BUG_ON(ARRAY_SIZE(direction) != ARRAY_SIZE(elems->ttlm)); if (!ieee80211_vif_is_mld(&sdata->vif)) return; dialog_token = mgmt->u.action.u.ttlm_req.dialog_token; ies_len = len - offsetof(struct ieee80211_mgmt, u.action.u.ttlm_req.variable); elems = ieee802_11_parse_elems(mgmt->u.action.u.ttlm_req.variable, ies_len, true, NULL); if (!elems) { ttlm_res = NEG_TTLM_RES_REJECT; goto out; } for (i = 0; i < elems->ttlm_num; i++) { if (ieee80211_parse_neg_ttlm(sdata, elems->ttlm[i], &neg_ttlm, &direction[i]) || (direction[i] == IEEE80211_TTLM_DIRECTION_BOTH && elems->ttlm_num != 1)) { ttlm_res = NEG_TTLM_RES_REJECT; goto out; } } if (!elems->ttlm_num || (elems->ttlm_num == 2 && direction[0] == direction[1])) { ttlm_res = NEG_TTLM_RES_REJECT; goto out; } for (i = 0; i < IEEE80211_TTLM_NUM_TIDS; i++) { if ((neg_ttlm.downlink[i] && (neg_ttlm.downlink[i] & ~sdata->vif.valid_links)) || (neg_ttlm.uplink[i] && (neg_ttlm.uplink[i] & ~sdata->vif.valid_links))) { ttlm_res = NEG_TTLM_RES_REJECT; goto out; } } ttlm_res = drv_can_neg_ttlm(sdata->local, sdata, &neg_ttlm); if (ttlm_res != NEG_TTLM_RES_ACCEPT) goto out; ieee80211_apply_neg_ttlm(sdata, neg_ttlm); out: kfree(elems); ieee80211_send_neg_ttlm_res(sdata, ttlm_res, dialog_token, &neg_ttlm); } void ieee80211_process_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { if (!ieee80211_vif_is_mld(&sdata->vif) || mgmt->u.action.u.ttlm_req.dialog_token != sdata->u.mgd.dialog_token_alloc) return; wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.neg_ttlm_timeout_work); /* MLD station sends a TID to link mapping request, mainly to handle * BTM (BSS transition management) request, in which case it needs to * restrict the active links set. * In this case it's not expected that the MLD AP will reject the * negotiated TTLM request. * This can be better implemented in the future, to handle request * rejections. */ if (le16_to_cpu(mgmt->u.action.u.ttlm_res.status_code) != WLAN_STATUS_SUCCESS) __ieee80211_disconnect(sdata); } void ieee80211_process_ttlm_teardown(struct ieee80211_sub_if_data *sdata) { u16 new_dormant_links; if (!sdata->vif.neg_ttlm.valid) return; memset(&sdata->vif.neg_ttlm, 0, sizeof(sdata->vif.neg_ttlm)); new_dormant_links = sdata->vif.dormant_links & ~sdata->vif.suspended_links; sdata->vif.suspended_links = 0; ieee80211_vif_set_links(sdata, sdata->vif.valid_links, new_dormant_links); ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_MLD_TTLM | BSS_CHANGED_MLD_VALID_LINKS); } static void ieee80211_teardown_ttlm_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, u.mgd.teardown_ttlm_work); ieee80211_process_ttlm_teardown(sdata); } void ieee80211_send_teardown_neg_ttlm(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct sk_buff *skb; int frame_len = offsetofend(struct ieee80211_mgmt, u.action.u.ttlm_tear_down); struct ieee80211_tx_info *info; skb = dev_alloc_skb(local->hw.extra_tx_headroom + frame_len); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); mgmt = skb_put_zero(skb, frame_len); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); memcpy(mgmt->da, sdata->vif.cfg.ap_addr, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; mgmt->u.action.u.ttlm_tear_down.action_code = WLAN_PROTECTED_EHT_ACTION_TTLM_TEARDOWN; info = IEEE80211_SKB_CB(skb); info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; info->status_data = IEEE80211_STATUS_TYPE_NEG_TTLM; ieee80211_tx_skb(sdata, skb); } EXPORT_SYMBOL(ieee80211_send_teardown_neg_ttlm); void ieee80211_sta_rx_queued_ext(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_link_data *link = &sdata->deflink; struct ieee80211_rx_status *rx_status; struct ieee80211_hdr *hdr; u16 fc; lockdep_assert_wiphy(sdata->local->hw.wiphy); rx_status = (struct ieee80211_rx_status *) skb->cb; hdr = (struct ieee80211_hdr *) skb->data; fc = le16_to_cpu(hdr->frame_control); switch (fc & IEEE80211_FCTL_STYPE) { case IEEE80211_STYPE_S1G_BEACON: ieee80211_rx_mgmt_beacon(link, hdr, skb->len, rx_status); break; } } void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_link_data *link = &sdata->deflink; struct ieee80211_rx_status *rx_status; struct ieee802_11_elems *elems; struct ieee80211_mgmt *mgmt; u16 fc; int ies_len; lockdep_assert_wiphy(sdata->local->hw.wiphy); rx_status = (struct ieee80211_rx_status *) skb->cb; mgmt = (struct ieee80211_mgmt *) skb->data; fc = le16_to_cpu(mgmt->frame_control); if (rx_status->link_valid) { link = sdata_dereference(sdata->link[rx_status->link_id], sdata); if (!link) return; } switch (fc & IEEE80211_FCTL_STYPE) { case IEEE80211_STYPE_BEACON: ieee80211_rx_mgmt_beacon(link, (void *)mgmt, skb->len, rx_status); break; case IEEE80211_STYPE_PROBE_RESP: ieee80211_rx_mgmt_probe_resp(link, skb); break; case IEEE80211_STYPE_AUTH: ieee80211_rx_mgmt_auth(sdata, mgmt, skb->len); break; case IEEE80211_STYPE_DEAUTH: ieee80211_rx_mgmt_deauth(sdata, mgmt, skb->len); break; case IEEE80211_STYPE_DISASSOC: ieee80211_rx_mgmt_disassoc(sdata, mgmt, skb->len); break; case IEEE80211_STYPE_ASSOC_RESP: case IEEE80211_STYPE_REASSOC_RESP: ieee80211_rx_mgmt_assoc_resp(sdata, mgmt, skb->len); break; case IEEE80211_STYPE_ACTION: if (!sdata->u.mgd.associated || !ether_addr_equal(mgmt->bssid, sdata->vif.cfg.ap_addr)) break; switch (mgmt->u.action.category) { case WLAN_CATEGORY_SPECTRUM_MGMT: ies_len = skb->len - offsetof(struct ieee80211_mgmt, u.action.u.chan_switch.variable); if (ies_len < 0) break; /* CSA IE cannot be overridden, no need for BSSID */ elems = ieee802_11_parse_elems( mgmt->u.action.u.chan_switch.variable, ies_len, true, NULL); if (elems && !elems->parse_error) { enum ieee80211_csa_source src = IEEE80211_CSA_SOURCE_PROT_ACTION; ieee80211_sta_process_chanswitch(link, rx_status->mactime, rx_status->device_timestamp, elems, elems, src); } kfree(elems); break; case WLAN_CATEGORY_PUBLIC: case WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION: ies_len = skb->len - offsetof(struct ieee80211_mgmt, u.action.u.ext_chan_switch.variable); if (ies_len < 0) break; /* * extended CSA IE can't be overridden, no need for * BSSID */ elems = ieee802_11_parse_elems( mgmt->u.action.u.ext_chan_switch.variable, ies_len, true, NULL); if (elems && !elems->parse_error) { enum ieee80211_csa_source src; if (mgmt->u.action.category == WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION) src = IEEE80211_CSA_SOURCE_PROT_ACTION; else src = IEEE80211_CSA_SOURCE_UNPROT_ACTION; /* for the handling code pretend it was an IE */ elems->ext_chansw_ie = &mgmt->u.action.u.ext_chan_switch.data; ieee80211_sta_process_chanswitch(link, rx_status->mactime, rx_status->device_timestamp, elems, elems, src); } kfree(elems); break; } break; } } static void ieee80211_sta_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = timer_container_of(sdata, t, u.mgd.timer); wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } void ieee80211_sta_connection_lost(struct ieee80211_sub_if_data *sdata, u8 reason, bool tx) { u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, reason, tx, frame_buf); ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, reason, false); } static int ieee80211_auth(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_mgd_auth_data *auth_data = ifmgd->auth_data; u32 tx_flags = 0; u16 trans = 1; u16 status = 0; struct ieee80211_prep_tx_info info = { .subtype = IEEE80211_STYPE_AUTH, }; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (WARN_ON_ONCE(!auth_data)) return -EINVAL; auth_data->tries++; if (auth_data->tries > IEEE80211_AUTH_MAX_TRIES) { sdata_info(sdata, "authentication with %pM timed out\n", auth_data->ap_addr); /* * Most likely AP is not in the range so remove the * bss struct for that AP. */ cfg80211_unlink_bss(local->hw.wiphy, auth_data->bss); return -ETIMEDOUT; } if (auth_data->algorithm == WLAN_AUTH_SAE) info.duration = jiffies_to_msecs(IEEE80211_AUTH_TIMEOUT_SAE); info.link_id = auth_data->link_id; drv_mgd_prepare_tx(local, sdata, &info); sdata_info(sdata, "send auth to %pM (try %d/%d)\n", auth_data->ap_addr, auth_data->tries, IEEE80211_AUTH_MAX_TRIES); auth_data->expected_transaction = 2; if (auth_data->algorithm == WLAN_AUTH_SAE) { trans = auth_data->sae_trans; status = auth_data->sae_status; auth_data->expected_transaction = trans; } if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_MLME_CONN_TX; ieee80211_send_auth(sdata, trans, auth_data->algorithm, status, auth_data->data, auth_data->data_len, auth_data->ap_addr, auth_data->ap_addr, NULL, 0, 0, tx_flags); if (tx_flags == 0) { if (auth_data->algorithm == WLAN_AUTH_SAE) auth_data->timeout = jiffies + IEEE80211_AUTH_TIMEOUT_SAE; else auth_data->timeout = jiffies + IEEE80211_AUTH_TIMEOUT; } else { auth_data->timeout = round_jiffies_up(jiffies + IEEE80211_AUTH_TIMEOUT_LONG); } auth_data->timeout_started = true; run_again(sdata, auth_data->timeout); return 0; } static int ieee80211_do_assoc(struct ieee80211_sub_if_data *sdata) { struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data; struct ieee80211_local *local = sdata->local; int ret; lockdep_assert_wiphy(sdata->local->hw.wiphy); assoc_data->tries++; assoc_data->comeback = false; if (assoc_data->tries > IEEE80211_ASSOC_MAX_TRIES) { sdata_info(sdata, "association with %pM timed out\n", assoc_data->ap_addr); /* * Most likely AP is not in the range so remove the * bss struct for that AP. */ cfg80211_unlink_bss(local->hw.wiphy, assoc_data->link[assoc_data->assoc_link_id].bss); return -ETIMEDOUT; } sdata_info(sdata, "associate with %pM (try %d/%d)\n", assoc_data->ap_addr, assoc_data->tries, IEEE80211_ASSOC_MAX_TRIES); ret = ieee80211_send_assoc(sdata); if (ret) return ret; if (!ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { assoc_data->timeout = jiffies + IEEE80211_ASSOC_TIMEOUT; assoc_data->timeout_started = true; run_again(sdata, assoc_data->timeout); } else { assoc_data->timeout = round_jiffies_up(jiffies + IEEE80211_ASSOC_TIMEOUT_LONG); assoc_data->timeout_started = true; run_again(sdata, assoc_data->timeout); } return 0; } vo